In [1]:
from copy import deepcopy
import numpy as np
import torch
from torch.optim import Adam
import pybulletgym
import gym
import time
import spinup.algos.pytorch.lstm_ddpg.core as core
from spinup.utils.logx import EpochLogger
import itertools

In [2]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [121]:
class ReplayBuffer:
    """
    A simple FIFO experience replay buffer for agents.
    """

    def __init__(self, obs_dim, act_dim, max_size):
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.max_size = max_size
        self.obs_buf = np.zeros(core.combined_shape(max_size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros(core.combined_shape(max_size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(core.combined_shape(max_size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(max_size, dtype=np.float32)
        self.done_buf = np.zeros(max_size, dtype=np.float32)
        self.ptr, self.size = 0, 0

    def store(self, obs, act, rew, next_obs, done):
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.obs2_buf[self.ptr] = list(next_obs)
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr+1) % self.max_size
        self.size = min(self.size+1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs_buf[idxs],
                     obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs],
                     rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs])
        return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in batch.items()}
    
    def sample_segment_batch(self, batch_size=32, max_seg_len=10):
        idxs = np.random.randint(max_seg_len, self.size, size=batch_size)
        # Segment
        if max_seg_len < 1:
            raise ValueError("max_seg_len=={}".format(max_seg_len))
        obs_seg = np.zeros([batch_size, max_seg_len, self.obs_dim])
        act_seg = np.zeros([batch_size, max_seg_len, self.act_dim])
        obs2_seg = np.zeros([batch_size, max_seg_len, self.obs_dim])
        act2_seg = np.zeros([batch_size, max_seg_len, self.act_dim])
        rew_seg = np.zeros([batch_size, max_seg_len])
        done_seg = np.zeros([batch_size, max_seg_len])
        seg_len = max_seg_len * np.ones(batch_size)
        for seg_i in range(max_seg_len):
            obs_seg[:, -1-seg_i, :] = self.obs_buf[idxs-seg_i, :]
            act_seg[:, -1-seg_i, :] = self.act_buf[idxs-seg_i, :]
            obs2_seg[:, -1-seg_i, :] = self.obs2_buf[idxs-seg_i, :]
            rew_seg[:, -1-seg_i] = self.rew_buf[idxs-seg_i]
            done_seg[:, -1-seg_i] = self.done_buf[idxs-seg_i]
        # If there is done in the backward experiences, only consider the experiences after the last done.
        for batch_i in range(batch_size):
            done_idxs_exclude_last_exp = np.where(done_seg[batch_i][:-1] == 1)  # Exclude last experience
            # If exist done
            if done_idxs_exclude_last_exp[0].size != 0:
                largest_done_id = done_idxs_exclude_last_exp[0][-1]
                seg_len[batch_i] = max_seg_len - (largest_done_id+1)

                # Only keep experiences after the last done
                obs_keep_part = np.copy(obs_seg[batch_i, largest_done_id+1:, :])
                act_keep_part = np.copy(act_seg[batch_i, largest_done_id+1:, :])
                obs2_keep_part = np.copy(obs2_seg[batch_i, largest_done_id+1:, :])
                rew_keep_part = np.copy(rew_seg[batch_i, largest_done_id+1:])
                done_keep_part = np.copy(done_seg[batch_i, largest_done_id+1:])
                
                # Set to 0 to make sure all experiences are at the beginning
                obs_seg[batch_i] = np.zeros([max_seg_len, self.obs_dim])
                act_seg[batch_i] = np.zeros([max_seg_len, self.act_dim])
                obs2_seg[batch_i] = np.zeros([max_seg_len, self.obs_dim])
                rew_seg[batch_i] = np.zeros([max_seg_len])
                done_seg[batch_i] = np.zeros([max_seg_len])

                # Move kept experiences to the start of the segment
                obs_seg[batch_i, :max_seg_len-(largest_done_id+1), :] = obs_keep_part
                act_seg[batch_i, :max_seg_len-(largest_done_id+1), :] = act_keep_part
                obs2_seg[batch_i, :max_seg_len-(largest_done_id+1), :] = obs2_keep_part
                rew_seg[batch_i, :max_seg_len-(largest_done_id+1)] = rew_keep_part
                done_seg[batch_i, :max_seg_len-(largest_done_id+1)] = done_keep_part
        act2_seg[:, :-1] = act_seg[:, 1:]
        batch = dict(obs=self.obs_buf[idxs],
                     obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs],
                     rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs],
                     obs_seg=obs_seg,
                     act_seg=act_seg,
                     obs2_seg=obs2_seg,
                     act2_seg=act2_seg,
                     rew_seg=rew_seg,
                     done_seg=done_seg,
                     seg_len=seg_len)
        return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in batch.items()}
    
    def sample_batch_with_history(self, batch_size=32, max_hist_len=100):
        idxs = np.random.randint(max_hist_len, self.size, size=batch_size)
        # History 
        if max_hist_len == 0:
            hist_obs = np.zeros([batch_size, 1, self.obs_dim])
            hist_act = np.zeros([batch_size, 1, self.act_dim])
            hist_obs2 = np.zeros([batch_size, 1, self.obs_dim])
            hist_act2 = np.zeros([batch_size, 1, self.act_dim])
            hist_rew = np.zeros([batch_size, 1])
            hist_done = np.zeros([batch_size, 1])
            hist_len = np.zeros(batch_size)
#             hist_msk = np.tile((hist_len!=0).astype(float).reshape([-1,1]), [1, 12]).shape
        else:
            hist_obs = np.zeros([batch_size, max_hist_len, self.obs_dim])
            hist_act = np.zeros([batch_size, max_hist_len, self.act_dim])
            hist_obs2 = np.zeros([batch_size, max_hist_len, self.obs_dim])
            hist_act2 = np.zeros([batch_size, max_hist_len, self.act_dim])
            hist_rew = np.zeros([batch_size, max_hist_len])
            hist_done = np.zeros([batch_size, max_hist_len])
            hist_len = max_hist_len * np.ones(batch_size)
            for hist_i in range(max_hist_len):
                hist_obs[:, -1-hist_i, :] = self.obs_buf[idxs-hist_i-1, :]
                hist_act[:, -1-hist_i, :] = self.act_buf[idxs-hist_i-1, :]
                hist_obs2[:, -1-hist_i, :] = self.obs2_buf[idxs-hist_i-1, :]
                hist_act2[:, -1-hist_i, :] = self.act_buf[idxs-hist_i, :]  # include a_t
                hist_rew[:, -1-hist_i] = self.rew_buf[idxs-hist_i-1]
                hist_done[:, -1-hist_i] = self.done_buf[idxs-hist_i-1]
            # If there is done in the backward experiences, only consider the experiences after the last done.
            for batch_i in range(batch_size):
                done_idxs_exclude_last_exp = np.where(hist_done[batch_i][:-1] == 1)  # Exclude last experience
                # If exist done
                if done_idxs_exclude_last_exp[0].size != 0:
                    largest_done_id = done_idxs_exclude_last_exp[0][-1]
                    hist_len[batch_i] = max_hist_len - (largest_done_id+1)

                    # Only keep experiences after the last done
                    obs_keep_part = np.copy(hist_obs[batch_i, largest_done_id+1:, :])
                    act_keep_part = np.copy(hist_act[batch_i, largest_done_id+1:, :])
                    obs2_keep_part = np.copy(hist_obs2[batch_i, largest_done_id+1:, :])
                    act2_keep_part = np.copy(hist_act2[batch_i, largest_done_id+1:, :])
                    rew_keep_part = np.copy(hist_rew[batch_i, largest_done_id+1:])
                    done_keep_part = np.copy(hist_done[batch_i, largest_done_id+1:])

                    # Set to 0 to make sure all experiences are at the beginning
                    hist_obs[batch_i] = np.zeros([max_hist_len, self.obs_dim])
                    hist_act[batch_i] = np.zeros([max_hist_len, self.act_dim])
                    hist_obs2[batch_i] = np.zeros([max_hist_len, self.obs_dim])
                    hist_act2[batch_i] = np.zeros([max_hist_len, self.act_dim])
                    hist_rew[batch_i] = np.zeros([max_hist_len])
                    hist_done[batch_i] = np.zeros([max_hist_len])

                    # Move kept experiences to the start of the segment
                    hist_obs[batch_i, :max_hist_len-(largest_done_id+1), :] = obs_keep_part
                    hist_act[batch_i, :max_hist_len-(largest_done_id+1), :] = act_keep_part
                    hist_obs2[batch_i, :max_hist_len-(largest_done_id+1), :] = obs2_keep_part
                    hist_act2[batch_i, :max_hist_len-(largest_done_id+1), :] = act2_keep_part
                    hist_rew[batch_i, :max_hist_len-(largest_done_id+1)] = rew_keep_part
                    hist_done[batch_i, :max_hist_len-(largest_done_id+1)] = done_keep_part
        # 
        batch = dict(obs=self.obs_buf[idxs],
                     obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs],
                     rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs],
                     hist_obs=hist_obs,
                     hist_act=hist_act,
                     hist_obs2=hist_obs2,
                     hist_act2=hist_act2,
                     hist_rew=hist_rew,
                     hist_done=hist_done,
                     hist_len=hist_len)
        return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in batch.items()}
        

## Single Stream

In [103]:
class Critic(nn.Module):
    def __init__(self, obs_dim, act_dim,
                 pre_lstm_hid_sizes=(128, 128), lstm_hid_sizes=(128, 128), post_lstm_hid_sizes=(128, 128)):
        super(Critic, self).__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        # 
        self.pre_lstm_layers = nn.ModuleList()
        self.lstm_layers = nn.ModuleList()
        self.post_lstm_layers = nn.ModuleList()
        # Pre-LSTM
        pre_lstm_layer_sizes = [obs_dim+act_dim] + list(pre_lstm_hid_sizes)
        for pre_lstm_h in range(len(pre_lstm_layer_sizes)-1):
            self.pre_lstm_layers += [nn.Linear(pre_lstm_layer_sizes[pre_lstm_h], 
                                               pre_lstm_layer_sizes[pre_lstm_h+1]),
                                     nn.ReLU()]
        # LSTM
        lstm_layer_sizes = [pre_lstm_layer_sizes[-1]] + list(lstm_hid_sizes)
        for lstm_h in range(len(lstm_layer_sizes)-1):
            self.lstm_layers += [nn.LSTM(lstm_layer_sizes[lstm_h], lstm_layer_sizes[lstm_h+1], batch_first=True)]
        # Post-LSTM
        post_lstm_layer_sizes = [lstm_layer_sizes[-1]] + list(post_lstm_hid_sizes) + [1]
        for post_lstm_h in range(len(post_lstm_layer_sizes)-2):
            self.post_lstm_layers += [nn.Linear(post_lstm_layer_sizes[pre_lstm_h], 
                                                post_lstm_layer_sizes[pre_lstm_h+1]),
                                      nn.ReLU()]
        self.post_lstm_layers += [nn.Linear(post_lstm_layer_sizes[-2], post_lstm_layer_sizes[-1]), nn.Identity()]
    
    def forward(self, obs_seg, act_seg, seg_len, out_len=1):
        x = torch.cat([obs_seg, act_seg], dim=-1)
        # Pre-LSTM
        for layer in self.pre_lstm_layers:
            x = layer(x)
        # LSTM
        for layer in self.lstm_layers:
             x, (lstm_hidden_state, lstm_cell_state) = layer(x)
        # Post-LSTM
        for layer in self.post_lstm_layers:
            x = layer(x)
        out = torch.gather(x, 1, (seg_len-1).view(-1,1).unsqueeze(1).long()).squeeze(1)
        return out
    
class Actor(nn.Module):
    def __init__(self, obs_dim, act_dim, act_limit,
                 pre_lstm_hid_sizes=(128, 128), lstm_hid_sizes=(128, 128), post_lstm_hid_sizes=(128, 128)):
        super(Actor, self).__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.act_limit = act_limit
        
        # 
        self.pre_lstm_layers = nn.ModuleList()
        self.lstm_layers = nn.ModuleList()
        self.post_lstm_layers = nn.ModuleList()
        
        # Pre-LSTM
        pre_lstm_layer_sizes = [obs_dim] + list(pre_lstm_hid_sizes)
        for pre_lstm_h in range(len(pre_lstm_layer_sizes)-1):
            self.pre_lstm_layers += [nn.Linear(pre_lstm_layer_sizes[pre_lstm_h], 
                                               pre_lstm_layer_sizes[pre_lstm_h+1]),
                                     nn.ReLU()]
        # LSTM
        lstm_layer_sizes = [pre_lstm_layer_sizes[-1]] + list(lstm_hid_sizes)
        for lstm_h in range(len(lstm_layer_sizes)-1):
            self.lstm_layers += [nn.LSTM(lstm_layer_sizes[lstm_h], lstm_layer_sizes[lstm_h+1], batch_first=True)]
        # Post-LSTM
        post_lstm_layer_sizes = [lstm_layer_sizes[-1]] + list(post_lstm_hid_sizes) + [act_dim]
        for post_lstm_h in range(len(post_lstm_layer_sizes)-2):
            self.post_lstm_layers += [nn.Linear(post_lstm_layer_sizes[pre_lstm_h], 
                                                post_lstm_layer_sizes[pre_lstm_h+1]),
                                      nn.ReLU()]
        self.post_lstm_layers += [nn.Linear(post_lstm_layer_sizes[-2], post_lstm_layer_sizes[-1]), nn.Tanh()]
        
    def forward(self, obs_seg, seg_len, out_len=1):
        x = torch.cat([obs_seg], dim=-1)
        # Pre-LSTM
        for layer in self.pre_lstm_layers:
            x = layer(x)
        # LSTM
        for layer in self.lstm_layers:
             x, (lstm_hidden_state, lstm_cell_state) = layer(x)
        # Post-LSTM
        for layer in self.post_lstm_layers:
            x = layer(x)
        out = torch.gather(x, 1, (seg_len-1).view(-1,1).repeat(1, self.act_dim).unsqueeze(1).long()).squeeze(1)
        return self.act_limit * out

class ActorCritic(nn.Module):
    def __init__(self, obs_dim, act_dim, act_limit,
                 critic_pre_lstm_hid_sizes=(128, 128), critic_lstm_hid_sizes=(128, 128), critic_post_lstm_hid_sizes=(128, 128),
                 actor_pre_lstm_hid_sizes=(128, 128), actor_lstm_hid_sizes=(128, 128), actor_post_lstm_hid_sizes=(128, 128)):
        super(ActorCritic, self).__init__()
        self.q1 = Critic(obs_dim, act_dim, 
                         pre_lstm_hid_sizes=critic_pre_lstm_hid_sizes, 
                         lstm_hid_sizes=critic_lstm_hid_sizes, 
                         post_lstm_hid_sizes=critic_post_lstm_hid_sizes)
        self.q2 = Critic(obs_dim, act_dim,  
                         pre_lstm_hid_sizes=critic_pre_lstm_hid_sizes, 
                         lstm_hid_sizes=critic_lstm_hid_sizes, 
                         post_lstm_hid_sizes=critic_post_lstm_hid_sizes)
        self.pi = Actor(obs_dim, act_dim, act_limit=1,
                        pre_lstm_hid_sizes=actor_pre_lstm_hid_sizes, 
                        lstm_hid_sizes=actor_lstm_hid_sizes, 
                        post_lstm_hid_sizes=actor_post_lstm_hid_sizes)
    
    def act(self, obs_seg, seg_len):
        with torch.no_grad():
            return self.pi(obs_seg, seg_len).cpu().numpy() 

In [98]:
critic = Critic(5,5)
actor = Actor(5,5,1)
ac = ActorCritic(5,5,1)

In [153]:
obs_seg = np.random.rand(10, 10, 5)
act_seg = np.random.rand(10, 10, 5)
seg_len = 10*np.ones(10)

In [154]:
act_seg[:, -1, :] = 0
# act_seg

In [157]:
obs_seg = torch.as_tensor(obs_seg, dtype=torch.float32)
act_seg = torch.as_tensor(act_seg, dtype=torch.float32)
seg_len = torch.as_tensor(seg_len, dtype=torch.float32)

In [165]:
torch.arange(0, 10)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [167]:
act_seg[torch.arange(0, 10), (seg_len-1).long()] = 

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])

In [160]:
torch.gather(act_seg, 1, (seg_len-1).view(-1,1).repeat(1,5).unsqueeze(1).long()).squeeze(1) = np.random.rand(10, 5)

SyntaxError: can't assign to function call (<ipython-input-160-877757e7c6a8>, line 1)

In [162]:
(seg_len-1).view(-1,1).repeat(1,5).unsqueeze(1).long().shape

torch.Size([10, 1, 5])

In [164]:
act_seg[:, (seg_len-1).view(-1,1).repeat(1,5).unsqueeze(1).long(), :].shape


torch.Size([10, 10, 1, 5, 5])

In [137]:
list(seg_len.astype(int))

[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]

In [93]:
obs_seg = torch.as_tensor(obs_seg, dtype=torch.float32)
act_seg = torch.as_tensor(act_seg, dtype=torch.float32)
seg_len = torch.as_tensor(seg_len, dtype=torch.float32)


In [99]:
out = critic(obs_seg, act_seg, seg_len)
out = actor(obs_seg, seg_len)
ac.act(obs_seg, seg_len)
out.shape


torch.Size([10, 5])

In [89]:
torch.gather(out, 1, (seg_len-1).view(-1,1).repeat(1, 5).unsqueeze(1).long()).squeeze(1)

tensor([[-0.0326, -0.0106, -0.0763,  0.0901, -0.0656],
        [-0.0326, -0.0104, -0.0759,  0.0904, -0.0652],
        [-0.0323, -0.0107, -0.0761,  0.0906, -0.0650],
        [-0.0327, -0.0105, -0.0763,  0.0901, -0.0655],
        [-0.0325, -0.0107, -0.0761,  0.0902, -0.0654],
        [-0.0326, -0.0105, -0.0762,  0.0902, -0.0653],
        [-0.0326, -0.0105, -0.0762,  0.0904, -0.0654],
        [-0.0325, -0.0105, -0.0762,  0.0900, -0.0655],
        [-0.0320, -0.0109, -0.0764,  0.0904, -0.0650],
        [-0.0322, -0.0107, -0.0765,  0.0902, -0.0653]],
       grad_fn=<SqueezeBackward1>)

In [45]:
out_len = 3
# np.arange((seg_len -1) - out_len + 1, (seg_len -1))
(seg_len -1) - out_len + 1

tensor([7., 7., 7., 7., 7., 7., 7., 7., 7., 7.])

In [57]:
[
    [
        [7,7,7]
    ],
    [
        [8,8,8]
    ]
]

[[[7, 7, 7]], [[8, 8, 8]]]

In [63]:
for i in range(int(s_l-1-out_len+1), int(s_l)):
    print(i)

7
8
9


In [65]:
for s_l in seg_len:
    for i in range(int(s_l-1-out_len+1), int(s_l)):
        print(i)
        
#     print(np.arange(s_l-1-out_len+1, s_l+1))
# torch.arange([7], [9+1])

7
8
9
7
8
9
7
8
9
7
8
9
7
8
9
7
8
9
7
8
9
7
8
9
7
8
9
7
8
9


In [71]:
(seg_len).view(-1,1).unsqueeze(1).long()
# torch.gather(out, 1, (seg_len-1).view(-1,1).unsqueeze(1).long()).squeeze(1)

tensor([[[10]],

        [[10]],

        [[10]],

        [[10]],

        [[10]],

        [[10]],

        [[10]],

        [[10]],

        [[10]],

        [[10]]])

In [36]:
torch.gather(out, 1, (seg_len-1).view(-1,1).repeat(1, 1).unsqueeze(1).long()).squeeze(1)

tensor([[0.0737],
        [0.0735],
        [0.0737],
        [0.0733],
        [0.0737],
        [0.0736],
        [0.0737],
        [0.0736],
        [0.0737],
        [0.0736]], grad_fn=<SqueezeBackward1>)

In [20]:
obs_seg.shape
act_seg.shape
x = torch.cat([obs_seg, act_seg], dim=-1)

## Partial Observable Envs

* **OpenAI Gym**
    * **MuJoCo**
      1. [HalfCheetah-v3](https://github.com/openai/gym/blob/master/gym/envs/mujoco/half_cheetah_v3.py)
         [HalfCheetah-v2](https://github.com/openai/gym/blob/master/gym/envs/mujoco/half_cheetah.py)
        * observation: (d=17)
          * position: 1-8 (d=8)
          * velocity: 9-17 (d=9)
      2. [Ant-v3](https://github.com/openai/gym/blob/master/gym/envs/mujoco/ant_v3.py)
         [Ant-v2](https://github.com/openai/gym/blob/master/gym/envs/mujoco/ant.py)
        * observation: (d=111)
          * position: 1-13 (d=13)
          * velocity: 14-27 (d=14)
          * cfrc_ext: 28-111 (d=64)
      3. [Walker2d-v3](https://github.com/openai/gym/blob/master/gym/envs/mujoco/walker2d_v3.py)
         [Walker2d-v2](https://github.com/openai/gym/blob/master/gym/envs/mujoco/walker2d.py)
        * observation: (d=17)
          * position: 1-8 (d=8)
          * velocity: 9-17 (d=9)
      4. [Hopper-v3](https://github.com/openai/gym/blob/master/gym/envs/mujoco/hopper_v3.py)
         [Hopper-v2]()
        * observation: (d=11)
          * position: 1-5 (d=5)
          * velocity: 6-11 (d=6)
      5. [InvertedPendulum-v2](https://github.com/openai/gym/blob/master/gym/envs/mujoco/inverted_pendulum.py)
        * observation: (d=4)
          * position: 1-2 (d=2)
          * velocity: 3-4 (d=2)
      6. [InvertedDoublePendulum-v2](https://github.com/openai/gym/blob/master/gym/envs/mujoco/inverted_double_pendulum.py)
        * observation: (d=11)
          * cart position: 1 
          * link angles sin: 2-3
          * link angles cos: 4-5 
          * link velocity: 6-8 (d=3)
          * qfrc_constraint: 9-11 (d=3)
      7. [Swimmer-v3](https://github.com/openai/gym/blob/master/gym/envs/mujoco/swimmer_v3.py)
         [Swimmer-v2](https://github.com/openai/gym/blob/master/gym/envs/mujoco/walker2d.py)
        * observation: (d=8)
          * position: 1-3 (d=3)
          * velocity: 4-8 (d=5)
      8. [Thrower-v2](https://github.com/openai/gym/blob/master/gym/envs/mujoco/thrower.py)
        * observation: (d=23)
          * position: 1-7 (d=7)
          * velocity: 8-14 (d=7)
          * get_body_com("r_wrist_roll_link"): 15-17 (d=3)
          * get_body_com("ball"): 18-20 (d=3)
          * get_body_com("goal"): 21-23 (d=3)
      9. [Striker-v2](https://github.com/openai/gym/blob/master/gym/envs/mujoco/striker.py)
        * observation: (d=23)
          * position: 1-7 (d=7)
          * velocity: 8-14 (d=7)
          * get_body_com("tips_arm"): 15-17 (d=3)
          * get_body_com("object"): 18-20 (d=3)
          * get_body_com("goal"): 21-23 (d=3)
      10. [Pusher-v2](https://github.com/openai/gym/blob/master/gym/envs/mujoco/pusher.py)
        * observation: (d=23)
          * position: 1-7 (d=7)
          * velocity: 8-14 (d=7)
          * get_body_com("tips_arm"): 15-17 (d=3)
          * get_body_com("object"): 18-20 (d=3)
          * get_body_com("goal"): 21-23 (d=3)
      11. [Reacher-v2](https://github.com/openai/gym/blob/master/gym/envs/mujoco/reacher.py)
        * observation: (d=11)
          * cos: 1-2 (d=2)
          * sin: 3-4 (d=2)
          * position: 5-6 (d=2)
          * velocity: 7-8 (d=2)
          * get_body_com("fingertip")-get_body_com("target"): 9-11 (d=3)
      12. [Humanoid-v3](https://github.com/openai/gym/blob/master/gym/envs/mujoco/humanoid_v3.py)
          [Humanoid-v2](https://github.com/openai/gym/blob/master/gym/envs/mujoco/humanoid.py)
        * observation: (d=376)
          * position: 1-22 (d=22)
          * velocity: 23-45 (d=23)
          * com_inertia: 46-185 (d=140)
          * com_velocity: 186-269 (d=84)
          * actuator_forces: 270-292 (d=23)
          * external_contact_forces: 293-376 (d=84)
      13. [HumanoidStandup-v2](https://github.com/openai/gym/blob/master/gym/envs/mujoco/humanoidstandup.py)
       * observation: (d=376)
          * position: 1-22 (d=22)
          * velocity: 23-45 (d=23)
          * com_inertia: 46-185 (d=140)
          * com_velocity: 186-269 (d=84)
          * actuator_forces: 270-292 (d=23)
          * external_contact_forces: 293-376 (d=84)

* **PyBulletGym**
    * **RoboSchool Envs**
      1. [HalfCheetahPyBulletEnv-v0](https://github.com/benelot/pybullet-gym/blob/master/pybulletgym/envs/roboschool/robots/locomotors/walker_base.py)
        * observation: (d=26)
          * more: (d=8)
            * distance at z: 1
            * angle_to_target sin: 2
            * angle_to_target cos: 3
            * velocity x: 4
            * velocity y: 5
            * velocity z: 6
            * roll: 7
            * pitch: 8
          * position: 9-20 (d=12)
          * feet contact: 21-26 (d=6)
      2. [AntPyBulletEnv-v0]()
        * observation: (d=28)
          * more: (d=8)
            * distance at z: 1
            * angle_to_target sin: 2
            * angle_to_target cos: 3
            * velocity x: 4
            * velocity y: 5
            * velocity z: 6
            * roll: 7
            * pitch: 8
          * position: 9-24 (d=16)
          * feet contact: 25-28 (d=4)

      3. [Walker2DPyBulletEnv-v0]()
        * observation: (d=22)
          * more: (d=8)
            * distance at z: 1
            * angle_to_target sin: 2
            * angle_to_target cos: 3
            * velocity x: 4
            * velocity y: 5
            * velocity z: 6
            * roll: 7
            * pitch: 8
          * position: 9-20 (d=12)
          * feet contact: 21-22 (d=2)

      4. [HopperPyBulletEnv-v0]()
        * observation: (d=15)
          * more: (d=8)
            * distance at z: 1
            * angle_to_target sin: 2
            * angle_to_target cos: 3
            * velocity x: 4
            * velocity y: 5
            * velocity z: 6
            * roll: 7
            * pitch: 8
          * position: 9-14 (d=6)
          * feet contact: 15 (d=1)

      5. [InvertedPendulumPyBulletEnv-v0](https://github.com/benelot/pybullet-gym/blob/master/pybulletgym/envs/roboschool/robots/pendula/interted_pendulum.py)
        * observation: (d=5)
          * slider x: 1
          * slider velocity x: 2
          * cos: 3
          * sin: 4
          * theta_dot: 5

      6. [InvertedDoublePendulumPyBulletEnv-v0](https://github.com/benelot/pybullet-gym/blob/master/pybulletgym/envs/roboschool/robots/pendula/inverted_double_pendulum.py)
       * observation: (d=9)
          * slider x: 1
          * slider velocity x: 2
          * pole2 x: 3
          * j1 cos: 4
          * j1 sin: 5
          * j1 dot: 6
          * j2 cos: 7
          * j2 sin: 8
          * j2 dot: 9

      7. [ReacherPyBulletEnv-v0](https://github.com/benelot/pybullet-gym/blob/master/pybulletgym/envs/roboschool/robots/manipulators/reacher.py)
        * observation: (d=9)
          * target x: 1
          * target y: 2
          * to_target_vec 1: 3
          * to_target_vec 2: 4
          * central_joint cos: 5
          * central_joint sin: 6
          * central_joint dot: 7
          * elbow_joint gamma: 8
          * elbow_joint gamma dot: 9

    * **MuJoCo Envs**
      1. [HalfCheetahMuJoCoEnv-v0](https://github.com/benelot/pybullet-gym/blob/master/pybulletgym/envs/mujoco/robots/locomotors/half_cheetah.py)
        * observation: (d=17)
          * position: 1-8 (d=8)
          * velocity: 9-17 (d=9)
      2. [AntMuJoCoEnv-v0](https://github.com/benelot/pybullet-gym/blob/master/pybulletgym/envs/mujoco/robots/locomotors/ant.py)
        * observation: (d=111)
          * position: 1-13 (d=13)
          * velocity: 14-27 (d=14)
          * cfrc_ext: 28-111 (d=64) (The cfrc_ext is set to zeros in PyBulletGym.) (The cfrc_ext are the external forces (force x,y,z and torque x,y,z) applied to each of the links at the center of mass. For the Ant, this is 14*6: the ground link, the torso link, and 12 links for all legs (3 links for each leg))
      3. [Walker2DMuJoCoEnv-v0](https://github.com/benelot/pybullet-gym/blob/master/pybulletgym/envs/mujoco/robots/locomotors/walker2d.py)
        * observation: (d=17)
          * position: 1-8 (d=8)
          * velocity: 9-17 (d=9)
      4. [HopperMuJoCoEnv-v0](https://github.com/benelot/pybullet-gym/blob/master/pybulletgym/envs/mujoco/robots/locomotors/hopper.py)
        * observation: (d=15)
          * position: 1-7 (d=7)
          * velocity: 8-15 (d=8)
      5. [InvertedPendulumMuJoCoEnv-v0](https://github.com/benelot/pybullet-gym/blob/master/pybulletgym/envs/mujoco/robots/pendula/inverted_pendulum.py)
        * observation: (d=4)
          * position: 1-3 (d=3)
          * velocity: 4 (d=1)
      6. [InvertedDoublePendulumMuJoCoEnv-v0](https://github.com/benelot/pybullet-gym/blob/master/pybulletgym/envs/mujoco/robots/pendula/inverted_double_pendulum.py)
        * observation: (d=11)
          * cart position: 1 
          * link angles sin: 2-3
          * link angles cos: 4-5
          * link velocity: 6-8
          * qfrc_constraint: 9-11


## Wrape Environment

In [109]:
class POMDPWrapper(gym.ObservationWrapper):
    def __init__(self, env_name):
        super().__init__(gym.make(env_name))
        
        # Remove velocity info
        # OpenAIGym
        #  1. MuJoCo
        if env_name == "HalfCheetah-v3" or env_name == "HalfCheetah-v2":
            self.remain_obs_idx = np.arange(0, 8)
        elif env_name == "Ant-v3" or env_name == "Ant-v2":
            self.remain_obs_idx = list(np.arange(0, 13)) + list(np.arange(27, 111))
        elif env_name == 'Walker2d-v3' or env_name == "Walker2d-v2":
            self.remain_obs_idx = np.arange(0, 8)
        elif env_name == 'Hopper-v3' or env_name == "Hopper-v2":
            self.remain_obs_idx = np.arange(0, 5)
        elif env_name == "InvertedPendulum-v2":
            self.remain_obs_idx = np.arange(0, 2)
        elif env_name == "InvertedDoublePendulum-v2":
            self.remain_obs_idx = list(np.arange(0, 5)) + list(np.arange(8, 11))
        elif env_name == "Swimmer-v3" or env_name == "Swimmer-v2":
            self.remain_obs_idx = np.arange(0, 3)
        elif env_name == "Thrower-v2":
            self.remain_obs_idx = list(np.arange(0, 7)) + list(np.arange(14, 23))
        elif env_name == "Striker-v2":
            self.remain_obs_idx = list(np.arange(0, 7)) + list(np.arange(14, 23))
        elif env_name == "Pusher-v2":
            self.remain_obs_idx = list(np.arange(0, 7)) + list(np.arange(14, 23))
        elif env_name == "Reacher-v2":
            self.remain_obs_idx = list(np.arange(0, 6)) + list(np.arange(8, 11))
        elif env_name == 'Humanoid-v3' or env_name == "Humanoid-v2":
            self.remain_obs_idx = list(np.arange(0, 22)) + list(np.arange(45, 185)) + list(np.arange(269, 376))
        elif env_name == 'HumanoidStandup-v2':
            self.remain_obs_idx = list(np.arange(0, 22)) + list(np.arange(45, 185)) + list(np.arange(269, 376))
        # PyBulletGym
        #  1. MuJoCo
        elif env_name == 'HalfCheetahMuJoCoEnv-v0':
            self.remain_obs_idx = np.arange(0, 8)
        elif env_name == 'AntMuJoCoEnv-v0':
            self.remain_obs_idx = list(np.arange(0, 13)) + list(np.arange(27, 111))
        elif env_name == 'Walker2DMuJoCoEnv-v0':
            self.remain_obs_idx = np.arange(0, 8)
        elif env_name == 'HopperMuJoCoEnv-v0':
            self.remain_obs_idx = np.arange(0, 7)
        elif env_name == 'InvertedPendulumMuJoCoEnv-v0':
            self.remain_obs_idx = np.arange(0, 3)
        elif env_name == 'InvertedDoublePendulumMuJoCoEnv-v0':
            self.remain_obs_idx = list(np.arange(0, 5)) + list(np.arange(8, 11))
        #  2. Roboschool
        elif env_name == 'HalfCheetahPyBulletEnv-v0':
            self.remain_obs_idx = list(set(np.arange(0,26)) - set(np.arange(3,6)))
        elif env_name ==  'AntPyBulletEnv-v0':
            self.remain_obs_idx = list(set(np.arange(0,28)) - set(np.arange(3,6)))
        elif env_name == 'Walker2DPyBulletEnv-v0':
            self.remain_obs_idx = list(set(np.arange(0,22)) - set(np.arange(3,6)))
        elif env_name == 'HopperPyBulletEnv-v0':
            self.remain_obs_idx = list(set(np.arange(0,15)) - set(np.arange(3,6)))
        elif env_name == 'InvertedPendulumPyBulletEnv-v0':
            self.remain_obs_idx = list(set(np.arange(0,5)) - set([1,4]))
        elif env_name == 'InvertedDoublePendulumPyBulletEnv-v0':
            self.remain_obs_idx = list(set(np.arange(0,9)) - set([1,5,8]))
        elif env_name == 'ReacherPyBulletEnv-v0':
            self.remain_obs_idx = list(set(np.arange(0,9)) - set([6,8]))
        else:
            raise ValueError('POMDP for {} is not defined!'.format(env_name))
            
        # Redefine observation_space
        obs_low = np.array([-np.inf for i in range(len(self.remain_obs_idx))], dtype="float32")
        obs_high = np.array([np.inf for i in range(len(self.remain_obs_idx))], dtype="float32")
        self.observation_space = gym.spaces.Box(obs_low, obs_high)
        
    def observation(self, obs):
        return obs.flatten()[self.remain_obs_idx]
    

## With Gated Memory but Without World Model

In [182]:
def td3(env_name, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, 
        steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, 
        polyak=0.995, pi_lr=1e-3, q_lr=1e-3, 
        batch_size=100, max_seg_len=100,
        start_steps=1000, 
        update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, 
        noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, 
        nonstationary_env = True,
        gravity_change_pattern = 'gravity_averagely_equal',
        partially_observable = False,
        freeze_hist_coding = False,
        logger_kwargs=dict(), save_freq=1):
    """
    Twin Delayed Deep Deterministic Policy Gradient (TD3)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of 
            observations as inputs, and ``q1`` and ``q2`` should accept a batch 
            of observations and a batch of actions as inputs. When called, 
            these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current 
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target 
            policy.

        noise_clip (float): Limit for absolute value of target policy 
            smoothing noise.

        policy_delay (int): Policy will only be updated once every 
            policy_delay times for each update of the Q-networks.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    # Wrapper environment if using POMDP
    if partially_observable == True:
        env, test_env = POMDPWrapper(env_name), POMDPWrapper(env_name)
    else:
        env, test_env = gym.make(env_name), gym.make(env_name)
    obs_dim = env.observation_space.shape[0]   
    act_dim = env.action_space.shape[0]
    
    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = ActorCritic(obs_dim, act_dim, act_limit,
                     critic_pre_lstm_hid_sizes=(128, ), 
                     critic_lstm_hid_sizes=(128, ), 
                     critic_post_lstm_hid_sizes=(128, ),
                     actor_pre_lstm_hid_sizes=(128, ), 
                     actor_lstm_hid_sizes=(128, ),
                     actor_post_lstm_hid_sizes=(128, ))
    ac_targ = deepcopy(ac)
    ac.cuda()
    ac_targ.cuda()

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False
        
    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())
    
    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, max_size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n'%var_counts)
    
    # Set up function for computing TD3 Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']
        obs_seg, act_seg, obs2_seg, act2_seg = data['obs_seg'], data['act_seg'], data['obs2_seg'], data['act2_seg']
        rew_seg, done_seg, seg_len = data['rew_seg'], data['done_seg'], data['seg_len']
        
        q1 = ac.q1(obs_seg, act_seg, seg_len)
        q2 = ac.q2(obs_seg, act_seg, seg_len)

        # Bellman backup for Q functions
        with torch.no_grad():
            pi_targ = ac_targ.pi(obs2_seg, seg_len)
            
            # Target policy smoothing
            epsilon = torch.randn_like(pi_targ) * target_noise
            epsilon = torch.clamp(epsilon, -noise_clip, noise_clip)
            a2 = pi_targ + epsilon
            a2 = torch.clamp(a2, -act_limit, act_limit)
            act_seg[torch.arange(0, len(seg_len)), (seg_len-1).long()] = a2
            # Target Q-values
            q1_pi_targ = ac_targ.q1(obs2_seg, act2_seg, seg_len)
            q2_pi_targ = ac_targ.q2(obs2_seg, act2_seg, seg_len)

            q_pi_targ = q1_pi_targ
            backup = r + gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        loss_info = dict(Q1Vals=q1.detach().cpu().numpy(),
                         Q2Vals=q2.detach().cpu().numpy())

        return loss_q, loss_info

    # Set up function for computing TD3 pi loss
    def compute_loss_pi(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']
        obs_seg, act_seg, obs2_seg, act2_seg = data['obs_seg'], data['act_seg'], data['obs2_seg'], data['act2_seg']
        rew_seg, done_seg, seg_len = data['rew_seg'], data['done_seg'], data['seg_len']
        import pdb
        pdb.set_trace()
        act_seg[torch.arange(0, len(seg_len)), (seg_len-1).long()] = ac.pi(obs_seg, seg_len)
        import pdb
        pdb.set_trace()
        q1_pi = ac.q1(obs_seg, act_seg, seg_len)
        return -q1_pi.mean()

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    q_optimizer = Adam(q_params, lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, timer):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **loss_info)

        # Possibly update pi and target networks
        if timer % policy_delay == 0:

#             # Freeze Q-networks so you don't waste computational effort 
#             # computing gradients for them during the policy learning step.
#             for p in q_params:
#                 p.requires_grad = False

            # Next run one gradient descent step for pi.
            pi_optimizer.zero_grad()
            loss_pi = compute_loss_pi(data)
            loss_pi.backward()
            pi_optimizer.step()

#             # Unfreeze Q-networks so you can optimize it at next DDPG step.
#             for p in q_params:
#                 p.requires_grad = True

            # Record things
            logger.store(LossPi=loss_pi.item())

            # Finally, update target networks by polyak averaging.
            with torch.no_grad():
                for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                    # NB: We use an in-place operations "mul_", "add_" to update target
                    # params, as opposed to "mul" and "add", which would make new tensors.
                    p_targ.data.mul_(polyak)
                    p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o_buff, o_buff_len, noise_scale):
        seg_o = torch.as_tensor(o_buff).view(1, o_buff.shape[0], o_buff.shape[1]).float().cuda()
        seg_l = torch.as_tensor([o_buff_len]).float().cuda()
        with torch.no_grad(): 
            a = ac.act(seg_o, seg_l).reshape(act_dim)
        a += noise_scale * np.random.randn(act_dim)
        if a.shape[0]!=act_dim:
            import pdb
            pdb.set_trace()
        return np.clip(a, -act_limit, act_limit)
    
    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            
            if max_seg_len>0:
                o_buff = np.zeros([max_seg_len, obs_dim])
                o_buff[0,:] = o
                o_buff_len = 1
                
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                a = get_action(o_buff, o_buff_len, 0)
                o2, r, d, _ = test_env.step(a)
                
                ep_ret += r
                ep_len += 1
                # Add short history
                if max_seg_len != 0:
                    if o_buff_len == max_seg_len:
                        o_buff[:max_seg_len-1] = o_buff[1:]
                        o_buff[max_seg_len-1] = list(o2)
                    else:
                        o_buff[o_buff_len+1-1] = list(o2)
                        o_buff_len += 1
                o = o2
                
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0
        
    if max_seg_len>0:
        o_buff = np.zeros([max_seg_len, obs_dim])
        o_buff[0,:] = o
        o_buff_len = 1

    # Main loop: collect experience in env and update/log each epoch
    start_time = time.time()
    for t in range(total_steps):
        if t%200 == 0:
            end_time = time.time()
            print("t={}, {}s".format(t, end_time-start_time))
            start_time = end_time
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards, 
        # use the learned policy (with some noise, via act_noise). 
        if t > start_steps:
            a = get_action(o_buff, o_buff_len, act_noise)
        else:
            a = env.action_space.sample()
        
        if nonstationary_env == True:
            gravity_cycle = 1000
            gravity_base = -9.81
            if gravity_change_pattern == 'gravity_averagely_equal':
                gravity = gravity_base * 1 / 2 * (np.cos(2 * np.pi / gravity_cycle * t) + 1) + gravity_base / 2
            elif gravity_change_pattern == 'gravity_averagely_easier':
                gravity = gravity_base * 1 / 2 * (np.cos(2 * np.pi / gravity_cycle * t) + 1)
            elif gravity_change_pattern == 'gravity_averagely_harder':
                gravity = gravity_base * 1 / 2 * (-np.cos(2 * np.pi / gravity_cycle * t) + 1) + gravity_base
            else:
                pass

            if 'PyBulletEnv' in env_name:
                env.env._p.setGravity(0, 0, gravity)
            elif 'Roboschool' in env_name:
                pass
            else:
                env.model.opt.gravity[2] = gravity
        
        # Step the env
        o2, r, d, _ = env.step(a)
            
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)
        
        # Add short history
        if max_seg_len != 0:
            if o_buff_len == max_seg_len:
                o_buff[:max_seg_len-1] = o_buff[1:]
                o_buff[max_seg_len-1] = list(o2)
            else:
                o_buff[o_buff_len+1-1] = list(o2)
                o_buff_len += 1
        
        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2
        
        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0
                
            if max_seg_len>0:
                o_buff = np.zeros([max_seg_len, obs_dim])
                o_buff[0,:] = o
                o_buff_len = 1
        
        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_segment_batch(batch_size, max_seg_len=max_seg_len)
                batch = {k: v.cuda() for k,v in batch.items()}
                update(data=batch, timer=j)

        # End of epoch handling
        if (t+1) % steps_per_epoch == 0:
            epoch = (t+1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()

In [183]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_seg_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': True,
        'freeze_hist_coding': False,
        'exp_name': 'single_stream_RTD3_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_seg_len=args['max_seg_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\single_stream_RTD3_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze\single_stream_RTD3_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetah-v2",
    "epochs":	50,
    "exp_name":	"single_stream_RTD3_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000002EA2399AA88>":	{
            "epoch_dict":	{},
            "exp_name":	"single_stream_RTD3_POMDP_HalfCheetah_PreLSTM1L128

(Pdb) test = ac.pi(obs_seg, seg_len)
(Pdb) test
tensor([[ 0.0279,  0.0878, -0.0482, -0.0296, -0.0751,  0.0949],
        [ 0.0185,  0.0958, -0.0603, -0.0384, -0.0921,  0.1073],
        [ 0.0223,  0.0961, -0.0524, -0.0323, -0.0793,  0.0974],
        [ 0.0234,  0.0851, -0.0494, -0.0288, -0.0791,  0.0933],
        [ 0.0265,  0.0840, -0.0527, -0.0251, -0.0755,  0.0928],
        [ 0.0220,  0.0866, -0.0556, -0.0295, -0.0824,  0.0994],
        [ 0.0281,  0.0836, -0.0550, -0.0263, -0.0796,  0.0971],
        [ 0.0314,  0.0726, -0.0530, -0.0275, -0.0765,  0.0958],
        [ 0.0289,  0.0825, -0.0529, -0.0260, -0.0738,  0.0944],
        [ 0.0244,  0.0903, -0.0515, -0.0285, -0.0819,  0.0981],
        [ 0.0294,  0.0802, -0.0527, -0.0267, -0.0796,  0.0957],
        [ 0.0212,  0.0860, -0.0517, -0.0279, -0.0808,  0.0943],
        [ 0.0186,  0.0932, -0.0531, -0.0295, -0.0840,  0.1000],
        [ 0.0220,  0.0821, -0.0554, -0.0254, -0.0882,  0.0927],
        [ 0.0244,  0.0781, -0.0489, -0.0231, -0.0801,  0

BdbQuit: 

In [178]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_seg_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': True,
        'freeze_hist_coding': False,
        'exp_name': 'pre_feature_extraction_gated_lstm_DDPG_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_seg_len=args['max_seg_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\pre_feature_extraction_gated_lstm_DDPG_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze\pre_feature_extraction_gated_lstm_DDPG_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetah-v2",
    "epochs":	50,
    "exp_name":	"pre_feature_extraction_gated_lstm_DDPG_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000002EA371A30C8>":	{
            "epoch_dict":	{},
            "e



t=800, 0.035902976989746094s
t=1000, 0.037900447845458984s
t=1200, 6.796823978424072s
t=1400, 7.088076591491699s
t=1600, 6.938413381576538s
t=1800, 7.83489465713501s
t=2000, 8.34070086479187s
t=2200, 7.900867938995361s
t=2400, 7.201749086380005s
t=2600, 7.388243913650513s
t=2800, 7.907245874404907s
t=3000, 6.924487352371216s
t=3200, 7.850051641464233s
t=3400, 7.679458141326904s
t=3600, 7.562774181365967s
t=3800, 7.350314140319824s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -157 |
|          StdEpRet |            72.9 |
|          MaxEpRet |           -69.1 |
|          MinEpRet |            -269 |
|  AverageTestEpRet |           -42.6 |
|      StdTestEpRet |           0.478 |
|      MaxTestEpRet |           -41.8 |
|      MinTestEpRet |           -43.3 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           4e+03 |
|     AverageQ1Vals |          -0.907 |
|    

---------------------------------------
|             Epoch |               6 |
|      AverageEpRet |            -411 |
|          StdEpRet |            73.2 |
|          MaxEpRet |            -297 |
|          MinEpRet |            -501 |
|  AverageTestEpRet |            -516 |
|      StdTestEpRet |            91.7 |
|      MaxTestEpRet |            -243 |
|      MinTestEpRet |            -566 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         2.4e+04 |
|     AverageQ1Vals |           -7.24 |
|         StdQ1Vals |           0.397 |
|         MaxQ1Vals |           -6.21 |
|         MinQ1Vals |            -8.2 |
|     AverageQ2Vals |           -7.24 |
|         StdQ2Vals |           0.397 |
|         MaxQ2Vals |           -6.19 |
|         MinQ2Vals |            -8.2 |
|            LossPi |            7.23 |
|             LossQ |           0.309 |
|              Time |            30.4 |
---------------------------------------


t=44000, 35.23973727226257s
t=44200, 8.822436332702637s
t=44400, 9.144517660140991s
t=44600, 9.083711385726929s
t=44800, 8.894216060638428s
t=45000, 8.970010042190552s
t=45200, 8.684805154800415s
t=45400, 8.639894723892212s
t=45600, 8.694721460342407s
t=45800, 8.70970892906189s
t=46000, 8.693780183792114s
t=46200, 8.661808729171753s
t=46400, 8.527198553085327s
t=46600, 8.661836385726929s
t=46800, 8.58304738998413s
t=47000, 9.257245779037476s
t=47200, 9.578420639038086s
t=47400, 8.235968828201294s
t=47600, 7.254591941833496s
t=47800, 7.303483486175537s
---------------------------------------
|             Epoch |              12 |
|      AverageEpRet |           -49.5 |
|          StdEpRet |            88.3 |
|          MaxEpRet |            54.9 |
|          MinEpRet |            -189 |
|  AverageTestEpRet |            -223 |
|      StdTestEpRet |            14.5 |
|      MaxTestEpRet |            -196 |
|      MinTestEpRet |            -241 |
|             EpLen |           1e+03 |
| 

t=66800, 7.882946729660034s
t=67000, 7.685453176498413s
t=67200, 7.785175323486328s
t=67400, 7.900877237319946s
t=67600, 8.007587194442749s
t=67800, 7.488973617553711s
---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |             210 |
|          StdEpRet |             175 |
|          MaxEpRet |             416 |
|          MinEpRet |           -16.7 |
|  AverageTestEpRet |             979 |
|      StdTestEpRet |              40 |
|      MaxTestEpRet |        1.04e+03 |
|      MinTestEpRet |             906 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.8e+04 |
|     AverageQ1Vals |           -19.7 |
|         StdQ1Vals |            0.22 |
|         MaxQ1Vals |           -19.1 |
|         MinQ1Vals |           -20.3 |
|     AverageQ2Vals |           -19.7 |
|         StdQ2Vals |           0.221 |
|         MaxQ2Vals |           -19.1 |
|         MinQ2Vals |           

t=88000, 30.354488134384155s
t=88200, 7.610250234603882s
t=88400, 7.864702463150024s
t=88600, 7.7478508949279785s
t=88800, 7.5468480587005615s
t=89000, 7.81836462020874s
t=89200, 7.907296895980835s
t=89400, 8.044695138931274s
t=89600, 7.783050775527954s
t=89800, 9.121782064437866s
t=90000, 9.533104658126831s
t=90200, 8.39629054069519s
t=90400, 7.881438970565796s
t=90600, 8.02341103553772s
t=90800, 7.582085609436035s
t=91000, 7.788300037384033s
t=91200, 7.694584608078003s
t=91400, 8.582535982131958s
t=91600, 8.176561832427979s
t=91800, 7.969926357269287s
---------------------------------------
|             Epoch |              23 |
|      AverageEpRet |             689 |
|          StdEpRet |             431 |
|          MaxEpRet |        1.28e+03 |
|          MinEpRet |             267 |
|  AverageTestEpRet |             396 |
|      StdTestEpRet |             331 |
|      MaxTestEpRet |        1.04e+03 |
|      MinTestEpRet |            -335 |
|             EpLen |           1e+03 |


t=110600, 8.76269268989563s
t=110800, 8.1489417552948s
t=111000, 8.565202951431274s
t=111200, 8.142438888549805s
t=111400, 7.956962585449219s
t=111600, 8.398462772369385s
t=111800, 8.024202108383179s
---------------------------------------
|             Epoch |              28 |
|      AverageEpRet |             845 |
|          StdEpRet |             321 |
|          MaxEpRet |        1.27e+03 |
|          MinEpRet |             364 |
|  AverageTestEpRet |            -438 |
|      StdTestEpRet |             450 |
|      MaxTestEpRet |             462 |
|      MinTestEpRet |            -681 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.12e+05 |
|     AverageQ1Vals |           -14.3 |
|         StdQ1Vals |            0.48 |
|         MaxQ1Vals |           -13.2 |
|         MinQ1Vals |           -15.5 |
|     AverageQ2Vals |           -14.3 |
|         StdQ2Vals |           0.481 |
|         MaxQ2Vals |           -13.2 |


t=132000, 31.237499952316284s
t=132200, 7.962702035903931s
t=132400, 7.694420099258423s
t=132600, 8.620954513549805s
t=132800, 8.956858396530151s
t=133000, 8.883249759674072s
t=133200, 8.61119532585144s
t=133400, 8.130743026733398s
t=133600, 8.449960708618164s
t=133800, 7.921497344970703s
t=134000, 9.17932653427124s
t=134200, 7.99663782119751s
t=134400, 7.410190105438232s
t=134600, 7.77554726600647s
t=134800, 7.731328010559082s
t=135000, 7.491966247558594s
t=135200, 8.30632209777832s
t=135400, 8.298831939697266s
t=135600, 7.4111833572387695s
t=135800, 7.481988191604614s
---------------------------------------
|             Epoch |              34 |
|      AverageEpRet |            -263 |
|          StdEpRet |             262 |
|          MaxEpRet |             178 |
|          MinEpRet |            -505 |
|  AverageTestEpRet |            -292 |
|      StdTestEpRet |             206 |
|      MaxTestEpRet |             222 |
|      MinTestEpRet |            -442 |
|             EpLen |  

t=154000, 7.80508828163147s
t=154200, 7.445117235183716s
t=154400, 7.438110828399658s
t=154600, 7.386239528656006s
t=154800, 7.3164427280426025s
t=155000, 7.340375661849976s
t=155200, 7.658520698547363s
t=155400, 7.433122634887695s
t=155600, 7.237645864486694s
t=155800, 7.5369789600372314s
---------------------------------------
|             Epoch |              39 |
|      AverageEpRet |           -91.2 |
|          StdEpRet |             143 |
|          MaxEpRet |            80.8 |
|          MinEpRet |            -271 |
|  AverageTestEpRet |            -284 |
|      StdTestEpRet |            1.62 |
|      MaxTestEpRet |            -281 |
|      MinTestEpRet |            -286 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.56e+05 |
|     AverageQ1Vals |           -3.54 |
|         StdQ1Vals |           0.198 |
|         MaxQ1Vals |            -3.1 |
|         MinQ1Vals |           -4.09 |
|     AverageQ2Vals |        

---------------------------------------
|             Epoch |              44 |
|      AverageEpRet |            -500 |
|          StdEpRet |             140 |
|          MaxEpRet |            -282 |
|          MinEpRet |            -626 |
|  AverageTestEpRet |            -186 |
|      StdTestEpRet |             208 |
|      MaxTestEpRet |            41.8 |
|      MinTestEpRet |            -746 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.76e+05 |
|     AverageQ1Vals |           0.152 |
|         StdQ1Vals |           0.256 |
|         MaxQ1Vals |           0.695 |
|         MinQ1Vals |          -0.457 |
|     AverageQ2Vals |           0.152 |
|         StdQ2Vals |           0.255 |
|         MaxQ2Vals |           0.693 |
|         MinQ2Vals |          -0.458 |
|            LossPi |          -0.163 |
|             LossQ |            2.13 |
|              Time |            29.9 |
---------------------------------------


t=196000, 28.794963598251343s
t=196200, 7.024216175079346s
t=196400, 7.063140153884888s
t=196600, 6.965373516082764s
t=196800, 7.397224426269531s
t=197000, 6.9903013706207275s
t=197200, 6.9892823696136475s
t=197400, 7.1050286293029785s
t=197600, 7.26708197593689s
t=197800, 7.085052967071533s
t=198000, 6.962389230728149s
t=198200, 7.0292017459869385s
t=198400, 7.044163942337036s
t=198600, 6.989310026168823s
t=198800, 7.179798364639282s
t=199000, 7.110098123550415s
t=199200, 7.119959592819214s
t=199400, 7.337380647659302s
t=199600, 7.127937316894531s
t=199800, 7.025181770324707s
---------------------------------------
|             Epoch |              50 |
|      AverageEpRet |            -412 |
|          StdEpRet |             234 |
|          MaxEpRet |           -43.2 |
|          MinEpRet |            -645 |
|  AverageTestEpRet |            -385 |
|      StdTestEpRet |             158 |
|      MaxTestEpRet |             -66 |
|      MinTestEpRet |            -582 |
|             Ep

In [134]:
args = {'env': 'HalfCheetahMuJoCoEnv-v0', 'hid': 256, 'l': 2, 
        'max_hist_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': True,
        'freeze_hist_coding': False,
        'exp_name': 'pre_feature_extraction_gated_lstm_td3_POMDP_HalfCheetahMuJoCoEnv_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\pre_feature_extraction_gated_lstm_td3_POMDP_HalfCheetahMuJoCoEnv_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze\pre_feature_extraction_gated_lstm_td3_POMDP_HalfCheetahMuJoCoEnv_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetahMuJoCoEnv-v0",
    "epochs":	50,
    "exp_name":	"pre_feature_extraction_gated_lstm_td3_POMDP_HalfCheetahMuJoCoEnv_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000014EF9FBBD08>":	{
           



[32;1m
Number of parameters: 	 pi: 334982, 	 q1: 335873, 	 q2: 335873
[0m
options= 
t=0, 0.0s
t=200, 0.16156959533691406s
t=400, 0.15558266639709473s
t=600, 0.1495988368988037s
t=800, 0.14960169792175293s
t=1000, 0.1556079387664795s
t=1200, 16.565677881240845s
t=1400, 16.82201910018921s
t=1600, 16.985579252243042s
t=1800, 17.019490480422974s
t=2000, 17.56604838371277s
t=2200, 17.485244512557983s
t=2400, 17.89315414428711s
t=2600, 17.80891251564026s
t=2800, 17.95003581047058s
t=3000, 17.793386459350586s
t=3200, 17.892156839370728s
t=3400, 18.4845712184906s
t=3600, 18.315027475357056s
t=3800, 20.104241132736206s
options= 
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -340 |
|          StdEpRet |            68.9 |
|          MaxEpRet |            -235 |
|          MinEpRet |            -428 |
|  AverageTestEpRet |            -394 |
|      StdTestEpRet |            11.5 |
|      MaxTestEpRet |            -381 |
|      Min

t=22200, 19.260498046875s
t=22400, 19.262491941452026s
t=22600, 19.575653791427612s
t=22800, 20.511154651641846s
t=23000, 19.025127410888672s
t=23200, 19.32033658027649s
t=23400, 19.06103014945984s
t=23600, 19.564685344696045s
t=23800, 19.924684286117554s
---------------------------------------
|             Epoch |               6 |
|      AverageEpRet |             -44 |
|          StdEpRet |            37.4 |
|          MaxEpRet |           -17.9 |
|          MinEpRet |            -109 |
|  AverageTestEpRet |           -26.1 |
|      StdTestEpRet |            29.5 |
|      MaxTestEpRet |            17.5 |
|      MinTestEpRet |           -86.1 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         2.4e+04 |
|     AverageQ1Vals |              14 |
|         StdQ1Vals |            9.51 |
|         MaxQ1Vals |            37.7 |
|         MinQ1Vals |           -5.53 |
|     AverageQ2Vals |              14 |
|         StdQ2Vals |   

t=44000, 106.83532166481018s
t=44200, 21.622180223464966s
t=44400, 23.240856409072876s
t=44600, 22.72822403907776s
t=44800, 21.064671516418457s
t=45000, 20.561376333236694s
t=45200, 22.410073041915894s
t=45400, 19.87285852432251s
t=45600, 19.351255416870117s
t=45800, 19.65145254135132s
t=46000, 19.457969427108765s
t=46200, 20.560023069381714s
t=46400, 19.985557794570923s
t=46600, 19.515815258026123s
t=46800, 20.319703102111816s
t=47000, 22.991522073745728s
t=47200, 20.161087036132812s
t=47400, 20.347590446472168s
t=47600, 20.399452447891235s
t=47800, 20.319665670394897s
---------------------------------------
|             Epoch |              12 |
|      AverageEpRet |             204 |
|          StdEpRet |            61.1 |
|          MaxEpRet |             269 |
|          MinEpRet |             118 |
|  AverageTestEpRet |             312 |
|      StdTestEpRet |             256 |
|      MaxTestEpRet |             557 |
|      MinTestEpRet |            -410 |
|             EpLen |  

t=66400, 21.592262983322144s
t=66600, 21.795717477798462s
t=66800, 21.684019088745117s
t=67000, 20.8711895942688s
t=67200, 20.03143548965454s
t=67400, 19.507835388183594s
t=67600, 19.932700395584106s
t=67800, 20.27980375289917s
---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |             673 |
|          StdEpRet |              24 |
|          MaxEpRet |             701 |
|          MinEpRet |             636 |
|  AverageTestEpRet |             590 |
|      StdTestEpRet |             449 |
|      MaxTestEpRet |             881 |
|      MinTestEpRet |            -738 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.8e+04 |
|     AverageQ1Vals |            18.8 |
|         StdQ1Vals |            8.74 |
|         MaxQ1Vals |            50.1 |
|         MinQ1Vals |           -13.4 |
|     AverageQ2Vals |            18.8 |
|         StdQ2Vals |            8.74 |
|         Ma

t=88000, 90.02128100395203s
t=88200, 19.25110960006714s
t=88400, 19.041083574295044s
t=88600, 19.102919101715088s
t=88800, 19.098931312561035s
t=89000, 19.235570430755615s
t=89200, 18.89546489715576s
t=89400, 19.308369159698486s
t=89600, 19.11987328529358s
t=89800, 18.943018436431885s
t=90000, 19.037095069885254s
t=90200, 19.835959672927856s
t=90400, 19.062026500701904s
t=90600, 18.97526240348816s
t=90800, 19.054048538208008s
t=91000, 19.04008674621582s
t=91200, 19.230575799942017s
t=91400, 18.958305597305298s
t=91600, 19.662937879562378s
t=91800, 19.565680503845215s
---------------------------------------
|             Epoch |              23 |
|      AverageEpRet |             762 |
|          StdEpRet |            76.5 |
|          MaxEpRet |             879 |
|          MinEpRet |             665 |
|  AverageTestEpRet |             607 |
|      StdTestEpRet |             337 |
|      MaxTestEpRet |             784 |
|      MinTestEpRet |            -394 |
|             EpLen |     

KeyboardInterrupt: 

In [133]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': True,
        'freeze_hist_coding': False,
        'exp_name': 'pre_feature_extraction_gated_lstm_td3_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\pre_feature_extraction_gated_lstm_td3_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze\pre_feature_extraction_gated_lstm_td3_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetah-v2",
    "epochs":	50,
    "exp_name":	"pre_feature_extraction_gated_lstm_td3_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000014F1292EB88>":	{
            "epoch_dict":	{},
            "exp_




t=600, 0.04089069366455078s
t=800, 0.035902976989746094s
t=1000, 0.03091716766357422s
t=1200, 16.930727243423462s
t=1400, 16.67640733718872s
t=1600, 17.10625982284546s
t=1800, 16.874874591827393s
t=2000, 17.146151781082153s
t=2200, 17.38949990272522s
t=2400, 17.157121658325195s
t=2600, 17.317692279815674s
t=2800, 17.602930307388306s
t=3000, 17.618887662887573s
t=3200, 17.24987292289734s
t=3400, 17.71862030029297s
t=3600, 17.801400423049927s
t=3800, 18.463627815246582s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -196 |
|          StdEpRet |            51.2 |
|          MaxEpRet |            -137 |
|          MinEpRet |            -278 |
|  AverageTestEpRet |            -447 |
|      StdTestEpRet |              78 |
|      MaxTestEpRet |            -298 |
|      MinTestEpRet |            -576 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           4e+03 |
|     

t=23400, 18.20631504058838s
t=23600, 18.39581060409546s
t=23800, 19.111894130706787s
---------------------------------------
|             Epoch |               6 |
|      AverageEpRet |             907 |
|          StdEpRet |             160 |
|          MaxEpRet |        1.11e+03 |
|          MinEpRet |             680 |
|  AverageTestEpRet |             957 |
|      StdTestEpRet |            61.3 |
|      MaxTestEpRet |        1.09e+03 |
|      MinTestEpRet |             840 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         2.4e+04 |
|     AverageQ1Vals |            24.8 |
|         StdQ1Vals |            6.92 |
|         MaxQ1Vals |              40 |
|         MinQ1Vals |           -2.47 |
|     AverageQ2Vals |            24.8 |
|         StdQ2Vals |            6.93 |
|         MaxQ2Vals |            40.9 |
|         MinQ2Vals |           -2.91 |
|            LossPi |           -25.7 |
|             LossQ |           0.6

t=44000, 81.64172148704529s
t=44200, 18.50548481941223s
t=44400, 18.911431550979614s
t=44600, 18.636165857315063s
t=44800, 18.560370206832886s
t=45000, 18.540422439575195s
t=45200, 18.419745683670044s
t=45400, 18.361900329589844s
t=45600, 18.50751256942749s
t=45800, 18.44866681098938s
t=46000, 18.232247829437256s
t=46200, 18.38982629776001s
t=46400, 18.543412923812866s
t=46600, 18.354921340942383s
t=46800, 18.72791886329651s
t=47000, 18.882508277893066s
t=47200, 18.379852056503296s
t=47400, 18.29009246826172s
t=47600, 18.846606016159058s
t=47800, 18.89282727241516s
---------------------------------------
|             Epoch |              12 |
|      AverageEpRet |             376 |
|          StdEpRet |             840 |
|          MaxEpRet |        1.24e+03 |
|          MinEpRet |            -486 |
|  AverageTestEpRet |        1.31e+03 |
|      StdTestEpRet |            34.4 |
|      MaxTestEpRet |        1.38e+03 |
|      MinTestEpRet |        1.28e+03 |
|             EpLen |       

t=66400, 18.1953444480896s
t=66600, 18.348936319351196s
t=66800, 18.915419578552246s
t=67000, 18.91940927505493s
t=67200, 18.161436557769775s
t=67400, 18.508507251739502s
t=67600, 18.200333833694458s
t=67800, 18.474597215652466s
---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |        1.22e+03 |
|          StdEpRet |            33.4 |
|          MaxEpRet |        1.27e+03 |
|          MinEpRet |        1.18e+03 |
|  AverageTestEpRet |        1.27e+03 |
|      StdTestEpRet |            27.8 |
|      MaxTestEpRet |        1.31e+03 |
|      MinTestEpRet |        1.22e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.8e+04 |
|     AverageQ1Vals |            60.7 |
|         StdQ1Vals |             9.2 |
|         MaxQ1Vals |              78 |
|         MinQ1Vals |            8.95 |
|     AverageQ2Vals |            60.7 |
|         StdQ2Vals |            9.21 |
|         M

t=88000, 81.48710346221924s
t=88200, 18.354918003082275s
t=88400, 18.469611644744873s
t=88600, 18.461633920669556s
t=88800, 18.929381847381592s
t=89000, 18.516486883163452s
t=89200, 18.66808319091797s
t=89400, 18.500529289245605s
t=89600, 19.099925756454468s
t=89800, 18.099603414535522s
t=90000, 19.044075965881348s
t=90200, 19.300389289855957s
t=90400, 18.382843732833862s
t=90600, 18.433709621429443s
t=90800, 18.478587865829468s
t=91000, 18.21230125427246s
t=91200, 18.335967540740967s
t=91400, 18.49853515625s
t=91600, 18.1444833278656s
t=91800, 18.225264310836792s
---------------------------------------
|             Epoch |              23 |
|      AverageEpRet |        1.29e+03 |
|          StdEpRet |            10.5 |
|          MaxEpRet |        1.31e+03 |
|          MinEpRet |        1.28e+03 |
|  AverageTestEpRet |        1.19e+03 |
|      StdTestEpRet |            88.6 |
|      MaxTestEpRet |        1.44e+03 |
|      MinTestEpRet |        1.13e+03 |
|             EpLen |        

t=110000, 20.166075944900513s
t=110200, 20.882752418518066s
t=110400, 19.78309988975525s
t=110600, 20.086095809936523s
t=110800, 20.943213939666748s
t=111000, 19.70231580734253s
t=111200, 20.19300413131714s
t=111400, 20.47126007080078s
t=111600, 20.938477754592896s
t=111800, 22.5870361328125s
---------------------------------------
|             Epoch |              28 |
|      AverageEpRet |        1.93e+03 |
|          StdEpRet |            39.3 |
|          MaxEpRet |        1.98e+03 |
|          MinEpRet |        1.88e+03 |
|  AverageTestEpRet |           2e+03 |
|      StdTestEpRet |              33 |
|      MaxTestEpRet |        2.06e+03 |
|      MinTestEpRet |        1.96e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.12e+05 |
|     AverageQ1Vals |            77.1 |
|         StdQ1Vals |            8.48 |
|         MaxQ1Vals |            96.6 |
|         MinQ1Vals |            11.9 |
|     AverageQ2Vals |     

t=131600, 20.833292245864868s
t=131800, 19.948656797409058s
---------------------------------------
|             Epoch |              33 |
|      AverageEpRet |        2.03e+03 |
|          StdEpRet |            39.6 |
|          MaxEpRet |        2.09e+03 |
|          MinEpRet |        1.98e+03 |
|  AverageTestEpRet |         2.2e+03 |
|      StdTestEpRet |            25.7 |
|      MaxTestEpRet |        2.23e+03 |
|      MinTestEpRet |        2.14e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.32e+05 |
|     AverageQ1Vals |            90.4 |
|         StdQ1Vals |            17.6 |
|         MaxQ1Vals |             123 |
|         MinQ1Vals |            14.9 |
|     AverageQ2Vals |            90.4 |
|         StdQ2Vals |            17.6 |
|         MaxQ2Vals |             123 |
|         MinQ2Vals |            15.5 |
|            LossPi |           -91.2 |
|             LossQ |            2.43 |
|              Time 

t=152000, 85.1805191040039s
t=152200, 19.378725290298462s
t=152400, 19.818650484085083s
t=152600, 22.103014707565308s
t=152800, 19.917470455169678s
t=153000, 19.96599793434143s
t=153200, 19.434957265853882s
t=153400, 19.477074146270752s
t=153600, 21.259429216384888s
t=153800, 20.525254726409912s
t=154000, 19.599496364593506s
t=154200, 19.644680500030518s
t=154400, 19.538530349731445s
t=154600, 19.606069087982178s
t=154800, 22.04862141609192s
t=155000, 22.191213846206665s
t=155200, 20.51792883872986s
t=155400, 20.577940940856934s
t=155600, 25.092573165893555s
t=155800, 26.10896611213684s
---------------------------------------
|             Epoch |              39 |
|      AverageEpRet |         2.4e+03 |
|          StdEpRet |            60.4 |
|          MaxEpRet |        2.47e+03 |
|          MinEpRet |        2.31e+03 |
|  AverageTestEpRet |        2.56e+03 |
|      StdTestEpRet |            28.7 |
|      MaxTestEpRet |         2.6e+03 |
|      MinTestEpRet |        2.48e+03 |
|     

t=173600, 21.502188205718994s
t=173800, 21.923033475875854s
t=174000, 20.514811038970947s
t=174200, 20.356048822402954s
t=174400, 21.240134477615356s
t=174600, 21.830864667892456s
t=174800, 22.371330976486206s
t=175000, 21.819201707839966s
t=175200, 21.304805040359497s
t=175400, 21.244550228118896s
t=175600, 20.678344011306763s
t=175800, 21.23548698425293s
---------------------------------------
|             Epoch |              44 |
|      AverageEpRet |        2.53e+03 |
|          StdEpRet |            67.2 |
|          MaxEpRet |        2.58e+03 |
|          MinEpRet |        2.42e+03 |
|  AverageTestEpRet |        2.73e+03 |
|      StdTestEpRet |            22.6 |
|      MaxTestEpRet |        2.77e+03 |
|      MinTestEpRet |        2.68e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.76e+05 |
|     AverageQ1Vals |             121 |
|         StdQ1Vals |            43.4 |
|         MaxQ1Vals |             174 |
|

t=195200, 20.09033727645874s
t=195400, 20.885432243347168s
t=195600, 19.58363389968872s
t=195800, 19.744203805923462s
---------------------------------------
|             Epoch |              49 |
|      AverageEpRet |        2.35e+03 |
|          StdEpRet |             806 |
|          MaxEpRet |        2.85e+03 |
|          MinEpRet |             957 |
|  AverageTestEpRet |        2.83e+03 |
|      StdTestEpRet |            63.5 |
|      MaxTestEpRet |        2.96e+03 |
|      MinTestEpRet |        2.75e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.96e+05 |
|     AverageQ1Vals |             147 |
|         StdQ1Vals |            41.9 |
|         MaxQ1Vals |             198 |
|         MinQ1Vals |            66.1 |
|     AverageQ2Vals |             147 |
|         StdQ2Vals |            41.9 |
|         MaxQ2Vals |             197 |
|         MinQ2Vals |            67.4 |
|            LossPi |            -149 |
| 

In [132]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': False,
        'freeze_hist_coding': False,
        'exp_name': 'pre_feature_extraction_gated_lstm_td3_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\pre_feature_extraction_gated_lstm_td3_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze\pre_feature_extraction_gated_lstm_td3_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetah-v2",
    "epochs":	50,
    "exp_name":	"pre_feature_extraction_gated_lstm_td3_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000014E9E6E5448>":	{
            "epoch_dict":	{},
            "exp_name":

t=12400, 18.039762258529663s
t=12600, 18.09165596961975s
t=12800, 18.721922636032104s
t=13000, 18.6680908203125s
t=13200, 18.096616506576538s
t=13400, 18.11156392097473s
t=13600, 18.09059476852417s
t=13800, 18.775822639465332s
t=14000, 18.1813542842865s
t=14200, 18.448416233062744s
t=14400, 18.370903253555298s
t=14600, 18.26516366004944s
t=14800, 18.08261489868164s
t=15000, 19.31238579750061s
t=15200, 19.026124954223633s
t=15400, 18.22723364830017s
t=15600, 18.303056955337524s
t=15800, 18.276130199432373s
---------------------------------------
|             Epoch |               4 |
|      AverageEpRet |             164 |
|          StdEpRet |             256 |
|          MaxEpRet |             474 |
|          MinEpRet |            -201 |
|  AverageTestEpRet |             438 |
|      StdTestEpRet |             145 |
|      MaxTestEpRet |             561 |
|      MinTestEpRet |             132 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEn

t=34800, 18.212299346923828s
t=35000, 18.04248309135437s
t=35200, 18.494545936584473s
t=35400, 18.909433603286743s
t=35600, 18.006880044937134s
t=35800, 18.049705743789673s
---------------------------------------
|             Epoch |               9 |
|      AverageEpRet |        1.01e+03 |
|          StdEpRet |             778 |
|          MaxEpRet |        2.35e+03 |
|          MinEpRet |             496 |
|  AverageTestEpRet |        2.12e+03 |
|      StdTestEpRet |        1.07e+03 |
|      MaxTestEpRet |        3.02e+03 |
|      MinTestEpRet |             352 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.6e+04 |
|     AverageQ1Vals |            9.83 |
|         StdQ1Vals |            23.4 |
|         MaxQ1Vals |             128 |
|         MinQ1Vals |           -51.1 |
|     AverageQ2Vals |            9.83 |
|         StdQ2Vals |            23.5 |
|         MaxQ2Vals |             122 |
|         MinQ2Vals |      

t=56000, 77.18059015274048s
t=56200, 17.645817041397095s
t=56400, 17.99709963798523s
t=56600, 17.78693127632141s
t=56800, 17.5989727973938s
t=57000, 18.02177929878235s
t=57200, 19.30238389968872s
t=57400, 18.146477937698364s
t=57600, 17.75055241584778s
t=57800, 17.639811754226685s
t=58000, 17.762532472610474s
t=58200, 17.673738956451416s
t=58400, 17.603899240493774s
t=58600, 17.803393602371216s
t=58800, 17.675734281539917s
t=59000, 19.127854108810425s
t=59200, 18.477591037750244s
t=59400, 17.786457777023315s
t=59600, 17.747522354125977s
t=59800, 17.41842484474182s
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |        2.52e+03 |
|          StdEpRet |             736 |
|          MaxEpRet |           3e+03 |
|          MinEpRet |        1.25e+03 |
|  AverageTestEpRet |        2.64e+03 |
|      StdTestEpRet |             888 |
|      MaxTestEpRet |        3.38e+03 |
|      MinTestEpRet |             937 |
|             EpLen |        

t=78400, 18.0188467502594s
t=78600, 17.917060613632202s
t=78800, 18.46961236000061s
t=79000, 18.499563694000244s
t=79200, 18.345913410186768s
t=79400, 21.958721160888672s
t=79600, 19.334299087524414s
t=79800, 18.359904527664185s
---------------------------------------
|             Epoch |              20 |
|      AverageEpRet |        3.03e+03 |
|          StdEpRet |            67.8 |
|          MaxEpRet |         3.1e+03 |
|          MinEpRet |        2.92e+03 |
|  AverageTestEpRet |           3e+03 |
|      StdTestEpRet |             621 |
|      MaxTestEpRet |        3.31e+03 |
|      MinTestEpRet |        1.15e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           8e+04 |
|     AverageQ1Vals |            83.4 |
|         StdQ1Vals |            78.8 |
|         MaxQ1Vals |             193 |
|         MinQ1Vals |           -31.9 |
|     AverageQ2Vals |            83.4 |
|         StdQ2Vals |            78.8 |
|         M

t=100000, 81.15699243545532s
t=100200, 20.81434440612793s
t=100400, 20.536085605621338s
t=100600, 20.108229637145996s
t=100800, 20.229905605316162s
t=101000, 20.413414001464844s
t=101200, 20.34260392189026s
t=101400, 20.343602657318115s
t=101600, 20.457295656204224s
t=101800, 20.9220552444458s
t=102000, 20.564010858535767s
t=102200, 20.602907419204712s
t=102400, 20.56101965904236s
t=102600, 20.323655366897583s
t=102800, 20.36554217338562s
t=103000, 20.633825540542603s
t=103200, 20.391472578048706s
t=103400, 20.70064687728882s
t=103600, 20.34260368347168s
t=103800, 20.31368350982666s
---------------------------------------
|             Epoch |              26 |
|      AverageEpRet |        2.76e+03 |
|          StdEpRet |             488 |
|          MaxEpRet |        3.17e+03 |
|          MinEpRet |        1.93e+03 |
|  AverageTestEpRet |        3.13e+03 |
|      StdTestEpRet |            52.2 |
|      MaxTestEpRet |         3.2e+03 |
|      MinTestEpRet |        3.06e+03 |
|         

t=121600, 17.94102692604065s
t=121800, 18.06668972969055s
t=122000, 18.17530584335327s
t=122200, 18.020819187164307s
t=122400, 18.14348268508911s
t=122600, 18.086602926254272s
t=122800, 18.209340572357178s
t=123000, 19.085960865020752s
t=123200, 18.208285570144653s
t=123400, 18.159438848495483s
t=123600, 18.288097858428955s
t=123800, 17.797441005706787s
---------------------------------------
|             Epoch |              31 |
|      AverageEpRet |        2.82e+03 |
|          StdEpRet |             467 |
|          MaxEpRet |        3.14e+03 |
|          MinEpRet |        2.01e+03 |
|  AverageTestEpRet |        3.07e+03 |
|      StdTestEpRet |             391 |
|      MaxTestEpRet |        3.33e+03 |
|      MinTestEpRet |        1.92e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.24e+05 |
|     AverageQ1Vals |             110 |
|         StdQ1Vals |              76 |
|         MaxQ1Vals |             193 |
|   

t=143200, 18.227233171463013s
t=143400, 18.46764850616455s
t=143600, 18.070674657821655s
t=143800, 18.18334937095642s
---------------------------------------
|             Epoch |              36 |
|      AverageEpRet |        2.72e+03 |
|          StdEpRet |             223 |
|          MaxEpRet |        2.96e+03 |
|          MinEpRet |        2.46e+03 |
|  AverageTestEpRet |        3.23e+03 |
|      StdTestEpRet |            70.6 |
|      MaxTestEpRet |        3.35e+03 |
|      MinTestEpRet |        3.12e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.44e+05 |
|     AverageQ1Vals |             107 |
|         StdQ1Vals |            68.5 |
|         MaxQ1Vals |             179 |
|         MinQ1Vals |           -14.3 |
|     AverageQ2Vals |             107 |
|         StdQ2Vals |            68.5 |
|         MaxQ2Vals |             179 |
|         MinQ2Vals |           -14.5 |
|            LossPi |            -108 |
| 

t=164000, 78.83519148826599s
t=164200, 18.111541509628296s
t=164400, 18.127527952194214s
t=164600, 17.954020500183105s
t=164800, 18.226233959197998s
t=165000, 18.743897914886475s
t=165200, 18.222283124923706s
t=165400, 18.38281488418579s
t=165600, 18.56535577774048s
t=165800, 18.02582621574402s
t=166000, 18.247178554534912s
t=166200, 17.9051513671875s
t=166400, 17.988868713378906s
t=166600, 18.157475471496582s
t=166800, 18.103586196899414s
t=167000, 18.106558799743652s
t=167200, 18.257179737091064s
t=167400, 18.156481981277466s
t=167600, 18.315022706985474s
t=167800, 18.370848417282104s
---------------------------------------
|             Epoch |              42 |
|      AverageEpRet |        1.85e+03 |
|          StdEpRet |        1.12e+03 |
|          MaxEpRet |        3.11e+03 |
|          MinEpRet |             648 |
|  AverageTestEpRet |        2.98e+03 |
|      StdTestEpRet |             821 |
|      MaxTestEpRet |        3.33e+03 |
|      MinTestEpRet |             525 |
|     

t=185600, 16.358460426330566s
t=185800, 16.991936206817627s
t=186000, 17.21464204788208s
t=186200, 17.505627870559692s
t=186400, 16.33229398727417s
t=186600, 16.877866506576538s
t=186800, 16.721287488937378s
t=187000, 16.61357593536377s
t=187200, 16.58864188194275s
t=187400, 16.469319581985474s
t=187600, 17.125210285186768s
t=187800, 16.586678504943848s
---------------------------------------
|             Epoch |              47 |
|      AverageEpRet |        2.06e+03 |
|          StdEpRet |             912 |
|          MaxEpRet |         3.1e+03 |
|          MinEpRet |             598 |
|  AverageTestEpRet |        3.27e+03 |
|      StdTestEpRet |            55.4 |
|      MaxTestEpRet |        3.32e+03 |
|      MinTestEpRet |        3.12e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.88e+05 |
|     AverageQ1Vals |             119 |
|         StdQ1Vals |            67.4 |
|         MaxQ1Vals |             186 |
|   

In [123]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': False,
        'freeze_hist_coding': False,
        'exp_name': 'gated_lstm_td3_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\gated_lstm_td3_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze\gated_lstm_td3_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetah-v2",
    "epochs":	50,
    "exp_name":	"gated_lstm_td3_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000014EFA8DF5C8>":	{
            "epoch_dict":	{},
            "exp_name":	"gated_lstm_td3_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L12



t=200, 0.06981348991394043s
t=400, 0.07081055641174316s
t=600, 0.0688166618347168s
t=800, 0.06682038307189941s
t=1000, 0.058844566345214844s
t=1200, 29978.925325393677s
t=1400, 21.72989320755005s
t=1600, 19.312796592712402s
t=1800, 14.932295560836792s
t=2000, 13.752226829528809s
t=2200, 13.480950593948364s
t=2400, 14.195046424865723s
t=2600, 15.154473304748535s
t=2800, 14.67147445678711s
t=3000, 13.999565601348877s
t=3200, 14.179250955581665s
t=3400, 14.267359972000122s
t=3600, 14.551057815551758s
t=3800, 14.402520656585693s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -497 |
|          StdEpRet |            90.2 |
|          MaxEpRet |            -341 |
|          MinEpRet |            -550 |
|  AverageTestEpRet |            -593 |
|      StdTestEpRet |            1.22 |
|      MaxTestEpRet |            -591 |
|      MinTestEpRet |            -595 |
|             EpLen |           1e+03 |
|         TestEpLen |        

t=22800, 18.181387901306152s
t=23000, 17.62782907485962s
t=23200, 19.62951350212097s
t=23400, 19.214617490768433s
t=23600, 16.358288049697876s
t=23800, 16.472955465316772s
---------------------------------------
|             Epoch |               6 |
|      AverageEpRet |            -549 |
|          StdEpRet |            2.29 |
|          MaxEpRet |            -547 |
|          MinEpRet |            -553 |
|  AverageTestEpRet |            -594 |
|      StdTestEpRet |            1.31 |
|      MaxTestEpRet |            -592 |
|      MinTestEpRet |            -596 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         2.4e+04 |
|     AverageQ1Vals |           -19.9 |
|         StdQ1Vals |            1.82 |
|         MaxQ1Vals |           0.629 |
|         MinQ1Vals |           -45.8 |
|     AverageQ2Vals |           -19.9 |
|         StdQ2Vals |            1.82 |
|         MaxQ2Vals |            1.48 |
|         MinQ2Vals |       

t=44000, 72.82925391197205s
t=44200, 16.46101713180542s
t=44400, 16.148784160614014s
t=44600, 16.004204750061035s
t=44800, 16.006200551986694s
t=45000, 16.16577172279358s
t=45200, 16.182754516601562s
t=45400, 16.215611219406128s
t=45600, 16.404163599014282s
t=45800, 17.598938941955566s
t=46000, 16.311357498168945s
t=46200, 16.094961643218994s
t=46400, 16.10094380378723s
t=46600, 16.05606770515442s
t=46800, 16.26650381088257s
t=47000, 16.10393714904785s
t=47200, 16.184722661972046s
t=47400, 16.28345799446106s
t=47600, 16.48392081260681s
t=47800, 16.669426441192627s
---------------------------------------
|             Epoch |              12 |
|      AverageEpRet |            -549 |
|          StdEpRet |            1.24 |
|          MaxEpRet |            -547 |
|          MinEpRet |            -550 |
|  AverageTestEpRet |            -593 |
|      StdTestEpRet |            1.28 |
|      MaxTestEpRet |            -592 |
|      MinTestEpRet |            -595 |
|             EpLen |        

t=66400, 16.3526451587677s
t=66600, 16.347285747528076s
t=66800, 16.174752473831177s
t=67000, 16.328307628631592s
t=67200, 16.55402636528015s
t=67400, 15.989245891571045s
t=67600, 16.513840913772583s
t=67800, 16.431090593338013s
---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |            -548 |
|          StdEpRet |            1.21 |
|          MaxEpRet |            -546 |
|          MinEpRet |            -550 |
|  AverageTestEpRet |            -593 |
|      StdTestEpRet |            1.22 |
|      MaxTestEpRet |            -591 |
|      MinTestEpRet |            -595 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.8e+04 |
|     AverageQ1Vals |           -40.2 |
|         StdQ1Vals |            4.75 |
|         MaxQ1Vals |           -30.9 |
|         MinQ1Vals |            -126 |
|     AverageQ2Vals |           -40.2 |
|         StdQ2Vals |            4.75 |
|         M

t=88000, 74.08988499641418s
t=88200, 16.56769609451294s
t=88400, 16.40613031387329s
t=88600, 16.464973211288452s
t=88800, 16.32733964920044s
t=89000, 16.24259662628174s
t=89200, 16.531798362731934s
t=89400, 17.11566925048828s
t=89600, 16.420092582702637s
t=89800, 16.2436203956604s
t=90000, 16.321383476257324s
t=90200, 16.534759521484375s
t=90400, 16.454999446868896s
t=90600, 16.278470754623413s
t=90800, 16.236583709716797s
t=91000, 16.34531855583191s
t=91200, 16.917736053466797s
t=91400, 16.095957040786743s
t=91600, 16.37122654914856s
t=91800, 16.55871891975403s
---------------------------------------
|             Epoch |              23 |
|      AverageEpRet |            -547 |
|          StdEpRet |           0.283 |
|          MaxEpRet |            -547 |
|          MinEpRet |            -548 |
|  AverageTestEpRet |            -593 |
|      StdTestEpRet |           0.713 |
|      MaxTestEpRet |            -592 |
|      MinTestEpRet |            -594 |
|             EpLen |          

t=110200, 16.52581000328064s
t=110400, 16.3881778717041s
t=110600, 16.397155046463013s
t=110800, 16.684385061264038s
t=111000, 16.859917402267456s
t=111200, 17.303727626800537s
t=111400, 17.66077733039856s
t=111600, 16.859915733337402s
t=111800, 16.364242553710938s
---------------------------------------
|             Epoch |              28 |
|      AverageEpRet |            -550 |
|          StdEpRet |           0.612 |
|          MaxEpRet |            -549 |
|          MinEpRet |            -550 |
|  AverageTestEpRet |            -593 |
|      StdTestEpRet |           0.882 |
|      MaxTestEpRet |            -592 |
|      MinTestEpRet |            -595 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.12e+05 |
|     AverageQ1Vals |           -47.4 |
|         StdQ1Vals |            6.87 |
|         MaxQ1Vals |           -40.4 |
|         MinQ1Vals |            -171 |
|     AverageQ2Vals |           -47.4 |
|         StdQ

t=131800, 16.46696949005127s
---------------------------------------
|             Epoch |              33 |
|      AverageEpRet |            -549 |
|          StdEpRet |            1.09 |
|          MaxEpRet |            -548 |
|          MinEpRet |            -551 |
|  AverageTestEpRet |            -594 |
|      StdTestEpRet |            0.76 |
|      MaxTestEpRet |            -593 |
|      MinTestEpRet |            -596 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.32e+05 |
|     AverageQ1Vals |           -49.8 |
|         StdQ1Vals |            9.24 |
|         MaxQ1Vals |           -42.3 |
|         MinQ1Vals |            -231 |
|     AverageQ2Vals |           -49.8 |
|         StdQ2Vals |            9.24 |
|         MaxQ2Vals |           -41.9 |
|         MinQ2Vals |            -231 |
|            LossPi |            49.8 |
|             LossQ |           0.868 |
|              Time |            74.2 |
-----------

t=152000, 73.64307808876038s
t=152200, 16.4559965133667s
t=152400, 16.195722818374634s
t=152600, 16.329303979873657s
t=152800, 16.47694158554077s
t=153000, 16.36627221107483s
t=153200, 16.174713134765625s
t=153400, 16.244560718536377s
t=153600, 16.289469242095947s
t=153800, 16.955634832382202s
t=154000, 16.543792486190796s
t=154200, 16.27445077896118s
t=154400, 16.373218297958374s
t=154600, 16.472951412200928s
t=154800, 16.24359655380249s
t=155000, 16.425047159194946s
t=155200, 16.46896266937256s
t=155400, 17.13817000389099s
t=155600, 16.390174388885498s
t=155800, 16.495890855789185s
---------------------------------------
|             Epoch |              39 |
|      AverageEpRet |            -548 |
|          StdEpRet |           0.987 |
|          MaxEpRet |            -547 |
|          MinEpRet |            -549 |
|  AverageTestEpRet |            -593 |
|      StdTestEpRet |           0.809 |
|      MaxTestEpRet |            -592 |
|      MinTestEpRet |            -595 |
|        

t=173800, 16.157785654067993s
t=174000, 16.48597025871277s
t=174200, 16.176743984222412s
t=174400, 16.11391043663025s
t=174600, 16.443031072616577s
t=174800, 16.457022666931152s
t=175000, 16.403108835220337s
t=175200, 16.635544776916504s
t=175400, 16.429042100906372s
t=175600, 16.375238180160522s
t=175800, 16.323353052139282s
---------------------------------------
|             Epoch |              44 |
|      AverageEpRet |            -548 |
|          StdEpRet |            1.91 |
|          MaxEpRet |            -545 |
|          MinEpRet |            -551 |
|  AverageTestEpRet |            -593 |
|      StdTestEpRet |            1.17 |
|      MaxTestEpRet |            -592 |
|      MinTestEpRet |            -595 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.76e+05 |
|     AverageQ1Vals |           -51.9 |
|         StdQ1Vals |            7.03 |
|         MaxQ1Vals |           -42.8 |
|         MinQ1Vals |           

t=195400, 16.222654104232788s
t=195600, 16.246553659439087s
t=195800, 16.383161544799805s
---------------------------------------
|             Epoch |              49 |
|      AverageEpRet |            -549 |
|          StdEpRet |            1.21 |
|          MaxEpRet |            -547 |
|          MinEpRet |            -551 |
|  AverageTestEpRet |            -593 |
|      StdTestEpRet |            1.15 |
|      MaxTestEpRet |            -592 |
|      MinTestEpRet |            -595 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.96e+05 |
|     AverageQ1Vals |           -52.4 |
|         StdQ1Vals |            6.36 |
|         MaxQ1Vals |           -44.1 |
|         MinQ1Vals |            -262 |
|     AverageQ2Vals |           -52.4 |
|         StdQ2Vals |            6.36 |
|         MaxQ2Vals |           -42.4 |
|         MinQ2Vals |            -264 |
|            LossPi |            52.4 |
|             LossQ |         

In [122]:
args = {'env': 'HalfCheetahMuJoCoEnv-v0', 'hid': 256, 'l': 2, 
        'max_hist_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': False,
        'freeze_hist_coding': False,
        'exp_name': 'gated_lstm_td3_MDP_HalfCheetahMuJoCo_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\gated_lstm_td3_MDP_HalfCheetahMuJoCo_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze\gated_lstm_td3_MDP_HalfCheetahMuJoCo_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetahMuJoCoEnv-v0",
    "epochs":	50,
    "exp_name":	"gated_lstm_td3_MDP_HalfCheetahMuJoCo_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000014EFA57A808>":	{
            "epoch_dict":	{},
            "exp_name":	"gated_lstm_td3_MDP_HalfCheetahMuJoCo_Pre

t=13400, 18.390820264816284s
t=13600, 18.188363790512085s
t=13800, 18.153459548950195s
t=14000, 18.492582321166992s
t=14200, 18.223237991333008s
t=14400, 18.279123306274414s
t=14600, 18.545438766479492s
t=14800, 18.44564652442932s
t=15000, 18.086663961410522s
t=15200, 18.096580266952515s
t=15400, 18.388856887817383s
t=15600, 18.33594059944153s
t=15800, 18.064696073532104s
---------------------------------------
|             Epoch |               4 |
|      AverageEpRet |            -541 |
|          StdEpRet |            17.9 |
|          MaxEpRet |            -514 |
|          MinEpRet |            -562 |
|  AverageTestEpRet |            -599 |
|      StdTestEpRet |              13 |
|      MaxTestEpRet |            -578 |
|      MinTestEpRet |            -627 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |           -14.5 |
|         StdQ1Vals |            6.16 |
|         MaxQ1Vals |    

t=35800, 18.236237287521362s
---------------------------------------
|             Epoch |               9 |
|      AverageEpRet |            -553 |
|          StdEpRet |            3.49 |
|          MaxEpRet |            -548 |
|          MinEpRet |            -557 |
|  AverageTestEpRet |            -599 |
|      StdTestEpRet |              14 |
|      MaxTestEpRet |            -575 |
|      MinTestEpRet |            -625 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.6e+04 |
|     AverageQ1Vals |           -29.1 |
|         StdQ1Vals |             4.6 |
|         MaxQ1Vals |           -1.65 |
|         MinQ1Vals |           -89.8 |
|     AverageQ2Vals |           -29.1 |
|         StdQ2Vals |             4.6 |
|         MaxQ2Vals |            -2.2 |
|         MinQ2Vals |           -89.8 |
|            LossPi |            29.2 |
|             LossQ |           0.275 |
|              Time |            95.3 |
-----------

KeyboardInterrupt: 

## Without World Model

In [90]:
def td3(env_name, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, 
        steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, 
        polyak=0.995, pi_lr=1e-3, q_lr=1e-3, 
        batch_size=100, max_hist_len=100,
        start_steps=1000, 
        update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, 
        noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, 
        nonstationary_env = True,
        gravity_change_pattern = 'gravity_averagely_equal',
        partially_observable = False,
        freeze_hist_coding = False,
        logger_kwargs=dict(), save_freq=1):
    """
    Twin Delayed Deep Deterministic Policy Gradient (TD3)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of 
            observations as inputs, and ``q1`` and ``q2`` should accept a batch 
            of observations and a batch of actions as inputs. When called, 
            these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current 
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target 
            policy.

        noise_clip (float): Limit for absolute value of target policy 
            smoothing noise.

        policy_delay (int): Policy will only be updated once every 
            policy_delay times for each update of the Q-networks.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    # Wrapper environment if using POMDP
    if partially_observable == True:
        env, test_env = POMDPWrapper(env_name), POMDPWrapper(env_name)
    else:
        env, test_env = gym.make(env_name), gym.make(env_name)
    obs_dim = env.observation_space.shape[0]   
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
#     ac = MLPActorCritic(obs_dim, act_dim, hid_state_layer_sizes=(),
#                         critic_hidden_sizes=(128, 128), actor_hidden_sizes=(128,128),
#                         share_state_net=True, no_history_memory=False)
    ac = MLPActorCritic(obs_dim, act_dim, act_limit,
                        critic_pre_lstm_hid_sizes=(128,), actor_pre_lstm_hid_sizes=(128,),
                        critic_hidden_sizes=(128, 128), actor_hidden_sizes=(128, 128),
                        lstm_hid_dim=128, lstm_hid_lay_num=2)
#     ac = MLPActorCriticRandomHist(obs_dim, act_dim, act_limit, 
#                                   critic_hidden_sizes=(128, 128), actor_hidden_sizes=(128, 128))
    ac_targ = deepcopy(ac)
    ac.cuda()
    ac_targ.cuda()
    
#     # Freeze hist coding
#     if freeze_hist_coding:
#         ac.q1.layers[0].requires_grad=False
#         ac.q2.layers[0].requires_grad=False
#         ac.pi.layers[0].requires_grad=False

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False
        
    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())
    
    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, max_size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n'%var_counts)

    # Set up function for computing TD3 Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']
        h_o, h_a, h_o2, h_a2, h_len = data['hist_obs'], data['hist_act'], data['hist_obs2'], data['hist_act2'], data['hist_len']

        q1 = ac.q1(o, a, h_o, h_a, h_len)
        q2 = ac.q2(o, a, h_o, h_a, h_len)
#         q1 = ac.q1(o, a)
#         q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            pi_targ = ac_targ.pi(o2, h_o2, h_a2, h_len)
#             pi_targ = ac_targ.pi(o2)
            
            # Target policy smoothing
            epsilon = torch.randn_like(pi_targ) * target_noise
            epsilon = torch.clamp(epsilon, -noise_clip, noise_clip)
            a2 = pi_targ + epsilon
            a2 = torch.clamp(a2, -act_limit, act_limit)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2, h_o2, h_a2, h_len)
            q2_pi_targ = ac_targ.q2(o2, a2, h_o2, h_a2, h_len)
#             q1_pi_targ = ac_targ.q1(o2, a2)
#             q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        loss_info = dict(Q1Vals=q1.detach().cpu().numpy(),
                         Q2Vals=q2.detach().cpu().numpy())

        return loss_q, loss_info

    # Set up function for computing TD3 pi loss
    def compute_loss_pi(data):
        o, h_o, h_a, h_len = data['obs'], data['hist_obs'], data['hist_act'], data['hist_len']
        q1_pi = ac.q1(o, ac.pi(o, h_o, h_a, h_len), h_o, h_a, h_len)
#         q1_pi = ac.q1(o, ac.pi(o))
        return -q1_pi.mean()

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    q_optimizer = Adam(q_params, lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, timer):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **loss_info)

        # Possibly update pi and target networks
        if timer % policy_delay == 0:

#             # Freeze Q-networks so you don't waste computational effort 
#             # computing gradients for them during the policy learning step.
#             for p in q_params:
#                 p.requires_grad = False

            # Next run one gradient descent step for pi.
            pi_optimizer.zero_grad()
            loss_pi = compute_loss_pi(data)
            loss_pi.backward()
            pi_optimizer.step()

#             # Unfreeze Q-networks so you can optimize it at next DDPG step.
#             for p in q_params:
#                 p.requires_grad = True

            # Record things
            logger.store(LossPi=loss_pi.item())

            # Finally, update target networks by polyak averaging.
            with torch.no_grad():
                for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                    # NB: We use an in-place operations "mul_", "add_" to update target
                    # params, as opposed to "mul" and "add", which would make new tensors.
                    p_targ.data.mul_(polyak)
                    p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, o_buff, a_buff, o_buff_len, noise_scale):
        h_o = torch.tensor(o_buff).view(1, o_buff.shape[0], o_buff.shape[1]).float().cuda()
        h_a = torch.tensor(a_buff).view(1, a_buff.shape[0], a_buff.shape[1]).float().cuda()
        h_l = torch.tensor([o_buff_len]).float().cuda()
        with torch.no_grad(): 
            a = ac.act(torch.as_tensor(o, dtype=torch.float32).view(1,-1).cuda(),
                       h_o, h_a, h_l).reshape(act_dim)
        a += noise_scale * np.random.randn(act_dim)
        if a.shape[0]!=act_dim:
            import pdb
            pdb.set_trace()
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
                
            if max_hist_len>0:
                o_buff = np.zeros([max_hist_len, obs_dim])
                a_buff = np.zeros([max_hist_len, act_dim])
                o_buff[0,:] = o
                o_buff_len = 0
            else:
                o_buff = np.zeros([1, obs_dim])
                a_buff = np.zeros([1, act_dim])
                o_buff_len = 0
                
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                a = get_action(o, o_buff, a_buff, o_buff_len, 0)
                o2, r, d, _ = test_env.step(a)
                
                ep_ret += r
                ep_len += 1
                # Add short history
                if max_hist_len != 0:
                    if o_buff_len == max_hist_len:
                        o_buff[:max_hist_len-1] = o_buff[1:]
                        a_buff[:max_hist_len-1] = a_buff[1:]
                        o_buff[max_hist_len-1] = list(o)
                        a_buff[max_hist_len-1] = list(a)
                    else:
                        if a.shape[0]!=act_dim:
                            import pdb
                            pdb.set_trace()
                        o_buff[o_buff_len+1-1] = list(o)
                        a_buff[o_buff_len+1-1] = list(a)
                        o_buff_len += 1
                o = o2
                
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0
        
    if max_hist_len>0:
        o_buff = np.zeros([max_hist_len, obs_dim])
        a_buff = np.zeros([max_hist_len, act_dim])
        o_buff[0,:] = o
        o_buff_len = 0
    else:
        o_buff = np.zeros([1, obs_dim])
        a_buff = np.zeros([1, act_dim])
        o_buff_len = 0

    # Main loop: collect experience in env and update/log each epoch
    start_time = time.time()
    for t in range(total_steps):
        if t%200 == 0:
            end_time = time.time()
            print("t={}, {}s".format(t, end_time-start_time))
            start_time = end_time
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards, 
        # use the learned policy (with some noise, via act_noise). 
        if t > start_steps:
            a = get_action(o, o_buff, a_buff, o_buff_len, act_noise)
        else:
            a = env.action_space.sample()
        
        if nonstationary_env == True:
            gravity_cycle = 1000
            gravity_base = -9.81
            if gravity_change_pattern == 'gravity_averagely_equal':
                gravity = gravity_base * 1 / 2 * (np.cos(2 * np.pi / gravity_cycle * t) + 1) + gravity_base / 2
            elif gravity_change_pattern == 'gravity_averagely_easier':
                gravity = gravity_base * 1 / 2 * (np.cos(2 * np.pi / gravity_cycle * t) + 1)
            elif gravity_change_pattern == 'gravity_averagely_harder':
                gravity = gravity_base * 1 / 2 * (-np.cos(2 * np.pi / gravity_cycle * t) + 1) + gravity_base
            else:
                pass

            if 'PyBulletEnv' in env_name:
                env.env._p.setGravity(0, 0, gravity)
            elif 'Roboschool' in env_name:
                pass
            else:
                env.model.opt.gravity[2] = gravity
        
        # Step the env
        o2, r, d, _ = env.step(a)
            
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)
        
        # Add short history
        if max_hist_len != 0:
            if o_buff_len == max_hist_len:
                o_buff[:max_hist_len-1] = o_buff[1:]
                a_buff[:max_hist_len-1] = a_buff[1:]
                o_buff[max_hist_len-1] = list(o)
                a_buff[max_hist_len-1] = list(a)
            else:
                if a.shape[0]!=act_dim:
                    import pdb
                    pdb.set_trace()
                o_buff[o_buff_len+1-1] = list(o)
                a_buff[o_buff_len+1-1] = list(a)
                o_buff_len += 1
        
        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2
        
        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0
                
            if max_hist_len>0:
                o_buff = np.zeros([max_hist_len, obs_dim])
                a_buff = np.zeros([max_hist_len, act_dim])
                o_buff[0,:] = o
                o_buff_len = 0
            else:
                o_buff = np.zeros([1, obs_dim])
                a_buff = np.zeros([1, act_dim])
                o_buff_len = 0
        
        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch_with_history(batch_size, max_hist_len)
                batch = {k: v.cuda() for k,v in batch.items()}
                update(data=batch, timer=j)

        # End of epoch handling
        if (t+1) % steps_per_epoch == 0:
            epoch = (t+1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()

In [93]:
args = {'env': 'HalfCheetahMuJoCoEnv-v0', 'hid': 256, 'l': 2, 
        'max_hist_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': False,
        'freeze_hist_coding': False,
        'exp_name': 'lstm_td3_MDP_HalfCheetahMuJoCo_PreLSTM1L128_HistMemory5Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_MDP_HalfCheetahMuJoCo_PreLSTM1L128_HistMemory5Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze\lstm_td3_MDP_HalfCheetahMuJoCo_PreLSTM1L128_HistMemory5Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetahMuJoCoEnv-v0",
    "epochs":	50,
    "exp_name":	"lstm_td3_MDP_HalfCheetahMuJoCo_PreLSTM1L128_HistMemory5Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000014F1295C688>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_MDP_HalfCheetahMuJoCo_PreLSTM1L128_HistMemory5Len



t=200, 0.19347929954528809s
t=400, 0.1541736125946045s
t=600, 0.13817453384399414s
t=800, 0.176530122756958s
t=1000, 0.14560937881469727s
t=1200, 16.76691508293152s
t=1400, 16.12334680557251s
t=1600, 16.870543241500854s
t=1800, 19.231250286102295s
t=2000, 17.220842838287354s
t=2200, 17.24915862083435s
t=2400, 18.403258323669434s
t=2600, 19.75970458984375s
t=2800, 16.711970567703247s
t=3000, 16.288362503051758s
t=3200, 16.61807107925415s
t=3400, 16.679887056350708s
t=3600, 16.457354068756104s
t=3800, 17.601354360580444s
options= 
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -463 |
|          StdEpRet |             135 |
|          MaxEpRet |            -229 |
|          MinEpRet |            -551 |
|  AverageTestEpRet |            -584 |
|      StdTestEpRet |              28 |
|      MaxTestEpRet |            -560 |
|      MinTestEpRet |            -666 |
|             EpLen |           1e+03 |
|         TestEpLen |    

t=22800, 17.22262215614319s
t=23000, 16.932042598724365s
t=23200, 17.06633687019348s
t=23400, 17.007333278656006s
t=23600, 17.869902849197388s
t=23800, 18.172602653503418s
---------------------------------------
|             Epoch |               6 |
|      AverageEpRet |            -535 |
|          StdEpRet |            1.89 |
|          MaxEpRet |            -533 |
|          MinEpRet |            -537 |
|  AverageTestEpRet |            -573 |
|      StdTestEpRet |            11.1 |
|      MaxTestEpRet |            -541 |
|      MinTestEpRet |            -585 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         2.4e+04 |
|     AverageQ1Vals |           -17.4 |
|         StdQ1Vals |            3.35 |
|         MaxQ1Vals |            6.68 |
|         MinQ1Vals |           -26.6 |
|     AverageQ2Vals |           -17.4 |
|         StdQ2Vals |            3.35 |
|         MaxQ2Vals |            7.06 |
|         MinQ2Vals |       

KeyboardInterrupt: 

In [91]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': False,
        'freeze_hist_coding': False,
        'exp_name': 'lstm_td3_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze\lstm_td3_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetah-v2",
    "epochs":	50,
    "exp_name":	"lstm_td3_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000014F12985288>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM2L128_HidStateLayer0_2L128_N



[32;1m
Number of parameters: 	 pi: 303238, 	 q1: 303361, 	 q2: 303361
[0m
t=0, 0.0s
t=200, 0.03194165229797363s
t=400, 0.02590322494506836s
t=600, 0.027926921844482422s
t=800, 0.03191423416137695s
t=1000, 0.044878482818603516s
t=1200, 15.171576738357544s
t=1400, 13.395674705505371s
t=1600, 13.220648527145386s
t=1800, 13.439063310623169s
t=2000, 13.823039054870605s
t=2200, 13.789126634597778s
t=2400, 13.84099006652832s
t=2600, 13.922769784927368s
t=2800, 14.157143592834473s
t=3000, 13.880881547927856s
t=3200, 14.144179105758667s
t=3400, 14.064392328262329s
t=3600, 14.080349445343018s
t=3800, 14.749558925628662s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -508 |
|          StdEpRet |            75.9 |
|          MaxEpRet |            -376 |
|          MinEpRet |            -552 |
|  AverageTestEpRet |            -596 |
|      StdTestEpRet |           0.889 |
|      MaxTestEpRet |            -594 |
|      MinTestEpRet 

t=22200, 15.700017929077148s
t=22400, 16.18571949005127s
t=22600, 16.51783037185669s
t=22800, 15.496562480926514s
t=23000, 15.679074764251709s
t=23200, 15.843634605407715s
t=23400, 15.517505168914795s
t=23600, 16.910781145095825s
t=23800, 16.763174533843994s
---------------------------------------
|             Epoch |               6 |
|      AverageEpRet |            -549 |
|          StdEpRet |           0.686 |
|          MaxEpRet |            -548 |
|          MinEpRet |            -550 |
|  AverageTestEpRet |            -596 |
|      StdTestEpRet |             1.7 |
|      MaxTestEpRet |            -593 |
|      MinTestEpRet |            -599 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         2.4e+04 |
|     AverageQ1Vals |           -2.18 |
|         StdQ1Vals |            7.41 |
|         MaxQ1Vals |            22.1 |
|         MinQ1Vals |           -33.3 |
|     AverageQ2Vals |           -2.18 |
|         StdQ2Vals |

t=44000, 89.72102308273315s
t=44200, 18.669743299484253s
t=44400, 18.267775774002075s
t=44600, 17.926068782806396s
t=44800, 20.6138756275177s
t=45000, 19.319339752197266s
t=45200, 19.745181560516357s
t=45400, 19.04065489768982s
t=45600, 18.284927368164062s
t=45800, 16.807057857513428s
t=46000, 17.48225474357605s
t=46200, 16.943692207336426s
t=46400, 16.821019411087036s
t=46600, 16.72826886177063s
t=46800, 17.359580278396606s
t=47000, 18.22027897834778s
t=47200, 18.32599639892578s
t=47400, 18.444711208343506s
t=47600, 18.62793755531311s
t=47800, 18.014822959899902s
---------------------------------------
|             Epoch |              12 |
|      AverageEpRet |           -61.7 |
|          StdEpRet |             540 |
|          MaxEpRet |             861 |
|          MinEpRet |            -502 |
|  AverageTestEpRet |             197 |
|      StdTestEpRet |             808 |
|      MaxTestEpRet |        1.21e+03 |
|      MinTestEpRet |            -591 |
|             EpLen |        

t=66600, 15.997993469238281s
t=66800, 17.345200538635254s
t=67000, 16.036118268966675s
t=67200, 15.932397365570068s
t=67400, 15.924418687820435s
t=67600, 15.875548601150513s
t=67800, 15.98126482963562s
---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |             779 |
|          StdEpRet |             985 |
|          MaxEpRet |        1.46e+03 |
|          MinEpRet |            -916 |
|  AverageTestEpRet |        1.31e+03 |
|      StdTestEpRet |             226 |
|      MaxTestEpRet |        1.55e+03 |
|      MinTestEpRet |             783 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.8e+04 |
|     AverageQ1Vals |            5.59 |
|         StdQ1Vals |            17.6 |
|         MaxQ1Vals |              59 |
|         MinQ1Vals |           -40.7 |
|     AverageQ2Vals |            5.59 |
|         StdQ2Vals |            17.6 |
|         MaxQ2Vals |            54.9 

t=88000, 101.96488785743713s
t=88200, 23.828743934631348s
t=88400, 23.560585021972656s
t=88600, 24.257877588272095s
t=88800, 21.606247425079346s
t=89000, 23.56199598312378s
t=89200, 25.678412199020386s
t=89400, 25.149527549743652s
t=89600, 17.315149307250977s
t=89800, 17.119255304336548s
t=90000, 17.294361352920532s
t=90200, 20.193349361419678s
t=90400, 20.1321861743927s
t=90600, 19.383883953094482s
t=90800, 20.22486114501953s
t=91000, 19.183156728744507s
t=91200, 19.36499786376953s
t=91400, 19.37776207923889s
t=91600, 19.892150402069092s
t=91800, 19.767404556274414s
---------------------------------------
|             Epoch |              23 |
|      AverageEpRet |         1.5e+03 |
|          StdEpRet |             102 |
|          MaxEpRet |        1.62e+03 |
|          MinEpRet |        1.36e+03 |
|  AverageTestEpRet |        1.46e+03 |
|      StdTestEpRet |            66.4 |
|      MaxTestEpRet |        1.62e+03 |
|      MinTestEpRet |        1.38e+03 |
|             EpLen |     

t=110000, 14.778481721878052s
t=110200, 14.831342220306396s
t=110400, 14.584002017974854s
t=110600, 15.501370429992676s
t=110800, 15.85360836982727s
t=111000, 15.228278636932373s
t=111200, 14.72363018989563s
t=111400, 14.877217769622803s
t=111600, 14.693708658218384s
t=111800, 14.714653491973877s
---------------------------------------
|             Epoch |              28 |
|      AverageEpRet |        1.57e+03 |
|          StdEpRet |             101 |
|          MaxEpRet |        1.65e+03 |
|          MinEpRet |         1.4e+03 |
|  AverageTestEpRet |        1.69e+03 |
|      StdTestEpRet |              74 |
|      MaxTestEpRet |        1.83e+03 |
|      MinTestEpRet |        1.57e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.12e+05 |
|     AverageQ1Vals |            23.9 |
|         StdQ1Vals |            25.4 |
|         MaxQ1Vals |            69.3 |
|         MinQ1Vals |           -46.5 |
|     AverageQ2Vals | 

t=131600, 15.670099020004272s
t=131800, 15.239249229431152s
---------------------------------------
|             Epoch |              33 |
|      AverageEpRet |        1.71e+03 |
|          StdEpRet |              67 |
|          MaxEpRet |        1.79e+03 |
|          MinEpRet |        1.61e+03 |
|  AverageTestEpRet |         1.5e+03 |
|      StdTestEpRet |             711 |
|      MaxTestEpRet |        1.82e+03 |
|      MinTestEpRet |            -624 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.32e+05 |
|     AverageQ1Vals |            31.1 |
|         StdQ1Vals |              27 |
|         MaxQ1Vals |            75.4 |
|         MinQ1Vals |           -52.8 |
|     AverageQ2Vals |            31.1 |
|         StdQ2Vals |              27 |
|         MaxQ2Vals |            76.5 |
|         MinQ2Vals |           -50.1 |
|            LossPi |           -33.1 |
|             LossQ |            24.5 |
|              Time 

t=152000, 68.92270112037659s
t=152200, 14.462327241897583s
t=152400, 14.819373607635498s
t=152600, 14.826354265213013s
t=152800, 14.55508017539978s
t=153000, 14.638855218887329s
t=153200, 14.817378044128418s
t=153400, 14.782471179962158s
t=153600, 14.833336353302002s
t=153800, 14.980940580368042s
t=154000, 14.706675052642822s
t=154200, 14.713655233383179s
t=154400, 15.028812885284424s
t=154600, 14.54012155532837s
t=154800, 14.718641757965088s
t=155000, 14.579015493392944s
t=155200, 14.681739568710327s
t=155400, 14.57901668548584s
t=155600, 14.848294973373413s
t=155800, 14.673763036727905s
---------------------------------------
|             Epoch |              39 |
|      AverageEpRet |        1.52e+03 |
|          StdEpRet |             156 |
|          MaxEpRet |        1.75e+03 |
|          MinEpRet |        1.31e+03 |
|  AverageTestEpRet |        1.31e+03 |
|      StdTestEpRet |             719 |
|      MaxTestEpRet |        1.69e+03 |
|      MinTestEpRet |            -823 |
|   

t=173600, 14.64882779121399s
t=173800, 14.519176006317139s
t=174000, 14.685732126235962s
t=174200, 14.43639612197876s
t=174400, 14.82435917854309s
t=174600, 14.546104669570923s
t=174800, 14.631873369216919s
t=175000, 14.638855934143066s
t=175200, 15.238252401351929s
t=175400, 14.547101736068726s
t=175600, 14.8074049949646s
t=175800, 14.791447877883911s
---------------------------------------
|             Epoch |              44 |
|      AverageEpRet |        1.64e+03 |
|          StdEpRet |            60.6 |
|          MaxEpRet |         1.7e+03 |
|          MinEpRet |        1.57e+03 |
|  AverageTestEpRet |         1.8e+03 |
|      StdTestEpRet |            43.5 |
|      MaxTestEpRet |         1.9e+03 |
|      MinTestEpRet |        1.74e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.76e+05 |
|     AverageQ1Vals |            33.5 |
|         StdQ1Vals |            28.6 |
|         MaxQ1Vals |            74.5 |
|    

t=195200, 16.71069073677063s
t=195400, 16.121890783309937s
t=195600, 15.906466484069824s
t=195800, 15.718966722488403s
---------------------------------------
|             Epoch |              49 |
|      AverageEpRet |        1.62e+03 |
|          StdEpRet |            60.8 |
|          MaxEpRet |        1.71e+03 |
|          MinEpRet |        1.54e+03 |
|  AverageTestEpRet |         1.7e+03 |
|      StdTestEpRet |              35 |
|      MaxTestEpRet |        1.76e+03 |
|      MinTestEpRet |        1.63e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.96e+05 |
|     AverageQ1Vals |              37 |
|         StdQ1Vals |            28.6 |
|         MaxQ1Vals |            76.2 |
|         MinQ1Vals |             -63 |
|     AverageQ2Vals |              37 |
|         StdQ2Vals |            28.6 |
|         MaxQ2Vals |            76.2 |
|         MinQ2Vals |           -62.8 |
|            LossPi |           -38.5 |
|

In [72]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': True,
        'freeze_hist_coding': False,
        'exp_name': 'lstm_td3_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze\lstm_td3_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetah-v2",
    "epochs":	50,
    "exp_name":	"lstm_td3_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000014E9E69EB88>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM2L128_HidStateLayer0



[32;1m
Number of parameters: 	 pi: 300934, 	 q1: 301057, 	 q2: 301057
[0m
t=0, 0.0s
t=200, 0.04089021682739258s
t=400, 0.024932861328125s
t=600, 0.03590536117553711s
t=800, 0.032910823822021484s
t=1000, 0.037899017333984375s
t=1200, 12.016866445541382s
t=1400, 12.405826807022095s
t=1600, 12.330028533935547s
t=1800, 12.92344331741333s
t=2000, 12.840664625167847s
t=2200, 13.170781135559082s
t=2400, 13.08401346206665s
t=2600, 12.681090593338013s
t=2800, 12.443725824356079s
t=3000, 13.541789293289185s
t=3200, 12.74791169166565s
t=3400, 12.678098440170288s
t=3600, 12.937406063079834s
t=3800, 13.117923021316528s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -400 |
|          StdEpRet |            85.9 |
|          MaxEpRet |            -275 |
|          MinEpRet |            -517 |
|  AverageTestEpRet |            -583 |
|      StdTestEpRet |            2.36 |
|      MaxTestEpRet |            -579 |
|      MinTestEpRet |   

t=22200, 14.343647480010986s
t=22400, 14.885196685791016s
t=22600, 14.431410312652588s
t=22800, 14.315717458724976s
t=23000, 14.77947998046875s
t=23200, 15.133532762527466s
t=23400, 14.785463571548462s
t=23600, 14.408471822738647s
t=23800, 15.711986064910889s
---------------------------------------
|             Epoch |               6 |
|      AverageEpRet |        1.82e+03 |
|          StdEpRet |            19.1 |
|          MaxEpRet |        1.85e+03 |
|          MinEpRet |        1.79e+03 |
|  AverageTestEpRet |        1.99e+03 |
|      StdTestEpRet |            30.6 |
|      MaxTestEpRet |        2.03e+03 |
|      MinTestEpRet |        1.94e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         2.4e+04 |
|     AverageQ1Vals |            8.71 |
|         StdQ1Vals |            19.2 |
|         MaxQ1Vals |            48.5 |
|         MinQ1Vals |           -13.7 |
|     AverageQ2Vals |            8.71 |
|         StdQ2Vals 

t=44000, 65.580637216568s
t=44200, 13.991586208343506s
t=44400, 14.037463903427124s
t=44600, 14.13819408416748s
t=44800, 14.444375991821289s
t=45000, 14.54710078239441s
t=45200, 14.120243072509766s
t=45400, 14.068380355834961s
t=45600, 14.073368310928345s
t=45800, 14.172106266021729s
t=46000, 14.454346656799316s
t=46200, 14.143182516098022s
t=46400, 14.185067415237427s
t=46600, 14.074367046356201s
t=46800, 14.356608390808105s
t=47000, 14.249896764755249s
t=47200, 14.203020334243774s
t=47400, 14.121240854263306s
t=47600, 14.323697328567505s
t=47800, 14.516183376312256s
---------------------------------------
|             Epoch |              12 |
|      AverageEpRet |        2.06e+03 |
|          StdEpRet |              28 |
|          MaxEpRet |        2.09e+03 |
|          MinEpRet |        2.02e+03 |
|  AverageTestEpRet |        2.24e+03 |
|      StdTestEpRet |            42.6 |
|      MaxTestEpRet |         2.3e+03 |
|      MinTestEpRet |        2.19e+03 |
|             EpLen |    

t=66400, 16.86490225791931s
t=66600, 18.39580988883972s
t=66800, 21.486353158950806s
t=67000, 18.59129238128662s
t=67200, 19.870859146118164s
t=67400, 19.990545511245728s
t=67600, 17.197015047073364s
t=67800, 18.109574794769287s
---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |        2.14e+03 |
|          StdEpRet |            37.1 |
|          MaxEpRet |         2.2e+03 |
|          MinEpRet |         2.1e+03 |
|  AverageTestEpRet |        2.39e+03 |
|      StdTestEpRet |            33.9 |
|      MaxTestEpRet |        2.45e+03 |
|      MinTestEpRet |        2.33e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.8e+04 |
|     AverageQ1Vals |             104 |
|         StdQ1Vals |            54.9 |
|         MaxQ1Vals |             141 |
|         MinQ1Vals |           -12.8 |
|     AverageQ2Vals |             104 |
|         StdQ2Vals |            54.9 |
|         M

t=88000, 70.37634015083313s
t=88200, 15.595479726791382s
t=88400, 15.616899013519287s
t=88600, 15.696027755737305s
t=88800, 14.58799147605896s
t=89000, 15.934393405914307s
t=89200, 15.573355436325073s
t=89400, 14.80840277671814s
t=89600, 14.518178701400757s
t=89800, 16.392166137695312s
t=90000, 15.392594575881958s
t=90200, 16.452760696411133s
t=90400, 16.37122130393982s
t=90600, 15.388853549957275s
t=90800, 15.929402112960815s
t=91000, 14.997896671295166s
t=91200, 14.663787841796875s
t=91400, 15.030808448791504s
t=91600, 16.922748804092407s
t=91800, 20.893131732940674s
---------------------------------------
|             Epoch |              23 |
|      AverageEpRet |        2.22e+03 |
|          StdEpRet |            31.6 |
|          MaxEpRet |        2.24e+03 |
|          MinEpRet |        2.16e+03 |
|  AverageTestEpRet |        2.46e+03 |
|      StdTestEpRet |            24.7 |
|      MaxTestEpRet |        2.51e+03 |
|      MinTestEpRet |        2.41e+03 |
|             EpLen |   

t=110000, 16.814348697662354s
t=110200, 23.301041841506958s
t=110400, 27.723571062088013s
t=110600, 19.919698476791382s
t=110800, 20.352080583572388s
t=111000, 18.319606065750122s
t=111200, 19.000738620758057s
t=111400, 17.918153524398804s
t=111600, 16.9336519241333s
t=111800, 15.445622682571411s
---------------------------------------
|             Epoch |              28 |
|      AverageEpRet |        2.27e+03 |
|          StdEpRet |            18.3 |
|          MaxEpRet |        2.29e+03 |
|          MinEpRet |        2.25e+03 |
|  AverageTestEpRet |        2.62e+03 |
|      StdTestEpRet |            12.6 |
|      MaxTestEpRet |        2.64e+03 |
|      MinTestEpRet |         2.6e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.12e+05 |
|     AverageQ1Vals |             146 |
|         StdQ1Vals |            62.4 |
|         MaxQ1Vals |             175 |
|         MinQ1Vals |             -34 |
|     AverageQ2Vals | 

t=131600, 14.165122270584106s
t=131800, 13.652493476867676s
---------------------------------------
|             Epoch |              33 |
|      AverageEpRet |        2.35e+03 |
|          StdEpRet |            24.8 |
|          MaxEpRet |        2.39e+03 |
|          MinEpRet |        2.32e+03 |
|  AverageTestEpRet |        2.62e+03 |
|      StdTestEpRet |            14.6 |
|      MaxTestEpRet |        2.63e+03 |
|      MinTestEpRet |        2.59e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.32e+05 |
|     AverageQ1Vals |             158 |
|         StdQ1Vals |            61.5 |
|         MaxQ1Vals |             185 |
|         MinQ1Vals |           -40.4 |
|     AverageQ2Vals |             158 |
|         StdQ2Vals |            61.5 |
|         MaxQ2Vals |             185 |
|         MinQ2Vals |           -40.1 |
|            LossPi |            -158 |
|             LossQ |            1.75 |
|              Time 

t=152000, 67.22158932685852s
t=152200, 14.338282585144043s
t=152400, 14.074591636657715s
t=152600, 14.612963199615479s
t=152800, 14.046538591384888s
t=153000, 14.483813524246216s
t=153200, 14.025196552276611s
t=153400, 14.224223375320435s
t=153600, 14.236072063446045s
t=153800, 14.259339332580566s
t=154000, 14.350496053695679s
t=154200, 14.418009042739868s
t=154400, 14.380718231201172s
t=154600, 19.7655668258667s
t=154800, 17.865838050842285s
t=155000, 14.830806732177734s
t=155200, 15.316054582595825s
t=155400, 15.014057874679565s
t=155600, 14.959788084030151s
t=155800, 14.776179313659668s
---------------------------------------
|             Epoch |              39 |
|      AverageEpRet |         2.4e+03 |
|          StdEpRet |              24 |
|          MaxEpRet |        2.44e+03 |
|          MinEpRet |        2.37e+03 |
|  AverageTestEpRet |        2.69e+03 |
|      StdTestEpRet |            10.9 |
|      MaxTestEpRet |        2.71e+03 |
|      MinTestEpRet |        2.67e+03 |
|  

t=173600, 14.400227308273315s
t=173800, 14.787033796310425s
t=174000, 13.844861268997192s
t=174200, 16.066523790359497s
t=174400, 15.920411586761475s
t=174600, 15.256935834884644s
t=174800, 14.363590240478516s
t=175000, 15.776812553405762s
t=175200, 20.606730461120605s
t=175400, 14.832417964935303s
t=175600, 14.288862466812134s
t=175800, 14.447042465209961s
---------------------------------------
|             Epoch |              44 |
|      AverageEpRet |        2.44e+03 |
|          StdEpRet |            42.9 |
|          MaxEpRet |         2.5e+03 |
|          MinEpRet |        2.38e+03 |
|  AverageTestEpRet |        2.73e+03 |
|      StdTestEpRet |            19.5 |
|      MaxTestEpRet |        2.75e+03 |
|      MinTestEpRet |         2.7e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.76e+05 |
|     AverageQ1Vals |             176 |
|         StdQ1Vals |            58.5 |
|         MaxQ1Vals |             200 |


t=195200, 13.564749956130981s
t=195400, 13.740267038345337s
t=195600, 13.998536825180054s
t=195800, 13.890885353088379s
---------------------------------------
|             Epoch |              49 |
|      AverageEpRet |        2.51e+03 |
|          StdEpRet |            18.2 |
|          MaxEpRet |        2.53e+03 |
|          MinEpRet |        2.48e+03 |
|  AverageTestEpRet |         2.8e+03 |
|      StdTestEpRet |            17.4 |
|      MaxTestEpRet |        2.81e+03 |
|      MinTestEpRet |        2.76e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.96e+05 |
|     AverageQ1Vals |             183 |
|         StdQ1Vals |            56.5 |
|         MaxQ1Vals |             204 |
|         MinQ1Vals |           -51.1 |
|     AverageQ2Vals |             183 |
|         StdQ2Vals |            56.5 |
|         MaxQ2Vals |             204 |
|         MinQ2Vals |             -52 |
|            LossPi |            -183 |


In [64]:
args = {'env': 'HalfCheetahMuJoCoEnv-v0', 'hid': 256, 'l': 2, 
        'max_hist_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': True,
        'freeze_hist_coding': False,
        'exp_name': 'lstm_td3_POMDP_HalfCheetahMuJoCoEnv_PreLSTM1L128_HistMemory5Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_PreLSTM1L128_HistMemory5Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_PreLSTM1L128_HistMemory5Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetahMuJoCoEnv-v0",
    "epochs":	50,
    "exp_name":	"lstm_td3_POMDP_HalfCheetahMuJoCoEnv_PreLSTM1L128_HistMemory5Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000014EE8B617C8>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_POMDP_HalfCheetahMuJoCoEnv_PreLSTM



t=200, 0.14760327339172363s
t=400, 0.11871194839477539s
t=600, 0.10768318176269531s
t=800, 0.1107022762298584s
t=1000, 0.1156926155090332s
t=1200, 13.94173812866211s
t=1400, 13.785120010375977s
t=1600, 14.430413246154785s
t=1800, 14.205015659332275s
t=2000, 13.59667420387268s
t=2200, 13.814027786254883s
t=2400, 13.995576620101929s
t=2600, 14.195276498794556s
t=2800, 14.147671222686768s
t=3000, 14.612956285476685s
t=3200, 14.579894304275513s
t=3400, 14.427450180053711s
t=3600, 13.493884563446045s
t=3800, 13.382215023040771s
options= 
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -244 |
|          StdEpRet |             119 |
|          MaxEpRet |           -91.4 |
|          MinEpRet |            -422 |
|  AverageTestEpRet |            -128 |
|      StdTestEpRet |             153 |
|      MaxTestEpRet |           -1.82 |
|      MinTestEpRet |            -420 |
|             EpLen |           1e+03 |
|         TestEpLen |

t=22800, 15.443676948547363s
t=23000, 15.466642618179321s
t=23200, 15.60826301574707s
t=23400, 16.15579891204834s
t=23600, 15.759857654571533s
t=23800, 15.932843685150146s
---------------------------------------
|             Epoch |               6 |
|      AverageEpRet |             737 |
|          StdEpRet |             287 |
|          MaxEpRet |        1.04e+03 |
|          MinEpRet |             275 |
|  AverageTestEpRet |             815 |
|      StdTestEpRet |             388 |
|      MaxTestEpRet |             980 |
|      MinTestEpRet |            -347 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         2.4e+04 |
|     AverageQ1Vals |            8.35 |
|         StdQ1Vals |            5.73 |
|         MaxQ1Vals |            29.5 |
|         MinQ1Vals |           -7.18 |
|     AverageQ2Vals |            8.35 |
|         StdQ2Vals |            5.73 |
|         MaxQ2Vals |            29.3 |
|         MinQ2Vals |       

t=44000, 91.5921061038971s
t=44200, 14.621902227401733s
t=44400, 15.722955465316772s
t=44600, 15.317042589187622s
t=44800, 15.15547513961792s
t=45000, 14.567046880722046s
t=45200, 14.57704782485962s
t=45400, 15.02878451347351s
t=45600, 14.882236003875732s
t=45800, 14.675762414932251s
t=46000, 14.754509925842285s
t=46200, 14.631874084472656s
t=46400, 15.272164106369019s
t=46600, 15.087655544281006s
t=46800, 14.647831916809082s
t=47000, 14.623900175094604s
t=47200, 14.609928131103516s
t=47400, 14.694706916809082s
t=47600, 14.607964038848877s
t=47800, 14.864226579666138s
---------------------------------------
|             Epoch |              12 |
|      AverageEpRet |        1.24e+03 |
|          StdEpRet |              21 |
|          MaxEpRet |        1.25e+03 |
|          MinEpRet |         1.2e+03 |
|  AverageTestEpRet |        1.39e+03 |
|      StdTestEpRet |            32.2 |
|      MaxTestEpRet |        1.43e+03 |
|      MinTestEpRet |        1.34e+03 |
|             EpLen |    

t=66400, 15.060786485671997s
t=66600, 15.477430820465088s
t=66800, 14.942396879196167s
t=67000, 14.7827787399292s
t=67200, 16.198353052139282s
t=67400, 15.267760515213013s
t=67600, 15.22772216796875s
t=67800, 14.84449553489685s
---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |        1.36e+03 |
|          StdEpRet |            29.2 |
|          MaxEpRet |        1.41e+03 |
|          MinEpRet |        1.33e+03 |
|  AverageTestEpRet |        1.48e+03 |
|      StdTestEpRet |            29.4 |
|      MaxTestEpRet |        1.51e+03 |
|      MinTestEpRet |         1.4e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.8e+04 |
|     AverageQ1Vals |            64.4 |
|         StdQ1Vals |            17.4 |
|         MaxQ1Vals |            84.4 |
|         MinQ1Vals |           -13.3 |
|     AverageQ2Vals |            64.4 |
|         StdQ2Vals |            17.4 |
|         Ma

t=88000, 77.386155128479s
t=88200, 15.675174236297607s
t=88400, 15.627092123031616s
t=88600, 15.169292449951172s
t=88800, 15.001017808914185s
t=89000, 15.334828853607178s
t=89200, 15.729181051254272s
t=89400, 15.087392807006836s
t=89600, 15.325685739517212s
t=89800, 15.024855852127075s
t=90000, 15.089654207229614s
t=90200, 14.994181394577026s
t=90400, 15.499199390411377s
t=90600, 15.428845882415771s
t=90800, 15.456247091293335s
t=91000, 15.126335144042969s
t=91200, 15.570762395858765s
t=91400, 15.934002161026001s
t=91600, 15.059723854064941s
t=91800, 15.805964469909668s
---------------------------------------
|             Epoch |              23 |
|      AverageEpRet |        1.45e+03 |
|          StdEpRet |              21 |
|          MaxEpRet |        1.47e+03 |
|          MinEpRet |        1.41e+03 |
|  AverageTestEpRet |        1.52e+03 |
|      StdTestEpRet |            18.1 |
|      MaxTestEpRet |        1.54e+03 |
|      MinTestEpRet |        1.49e+03 |
|             EpLen |  

t=110000, 18.126509428024292s
t=110200, 18.57332706451416s
t=110400, 17.66376757621765s
t=110600, 17.413464069366455s
t=110800, 17.15110754966736s
t=111000, 17.403464794158936s
t=111200, 17.36160373687744s
t=111400, 17.2568461894989s
t=111600, 17.345598697662354s
t=111800, 14.791447162628174s
---------------------------------------
|             Epoch |              28 |
|      AverageEpRet |        1.46e+03 |
|          StdEpRet |            73.4 |
|          MaxEpRet |        1.52e+03 |
|          MinEpRet |        1.33e+03 |
|  AverageTestEpRet |        1.44e+03 |
|      StdTestEpRet |            33.4 |
|      MaxTestEpRet |        1.48e+03 |
|      MinTestEpRet |        1.38e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.12e+05 |
|     AverageQ1Vals |            97.8 |
|         StdQ1Vals |              17 |
|         MaxQ1Vals |             114 |
|         MinQ1Vals |            14.4 |
|     AverageQ2Vals |     

t=131600, 18.86355996131897s
t=131800, 18.6357901096344s
---------------------------------------
|             Epoch |              33 |
|      AverageEpRet |        1.52e+03 |
|          StdEpRet |            20.1 |
|          MaxEpRet |        1.55e+03 |
|          MinEpRet |        1.49e+03 |
|  AverageTestEpRet |        1.56e+03 |
|      StdTestEpRet |            16.1 |
|      MaxTestEpRet |        1.58e+03 |
|      MinTestEpRet |        1.53e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.32e+05 |
|     AverageQ1Vals |             107 |
|         StdQ1Vals |            17.5 |
|         MaxQ1Vals |             123 |
|         MinQ1Vals |            19.3 |
|     AverageQ2Vals |             107 |
|         StdQ2Vals |            17.5 |
|         MaxQ2Vals |             123 |
|         MinQ2Vals |            19.5 |
|            LossPi |            -108 |
|             LossQ |            1.12 |
|              Time |  

t=152000, 80.79148864746094s
t=152200, 16.24755549430847s
t=152400, 16.088009119033813s
t=152600, 16.36022639274597s
t=152800, 16.335336208343506s
t=153000, 16.5178120136261s
t=153200, 16.134853839874268s
t=153400, 16.447021484375s
t=153600, 16.329363107681274s
t=153800, 16.307367086410522s
t=154000, 16.228602170944214s
t=154200, 16.433058261871338s
t=154400, 16.35629415512085s
t=154600, 16.446990728378296s
t=154800, 16.35626244544983s
t=155000, 16.23758292198181s
t=155200, 16.20267391204834s
t=155400, 16.420090675354004s
t=155600, 16.497883558273315s
t=155800, 15.083668231964111s
---------------------------------------
|             Epoch |              39 |
|      AverageEpRet |        1.51e+03 |
|          StdEpRet |            24.9 |
|          MaxEpRet |        1.55e+03 |
|          MinEpRet |        1.49e+03 |
|  AverageTestEpRet |        1.37e+03 |
|      StdTestEpRet |             657 |
|      MaxTestEpRet |        1.61e+03 |
|      MinTestEpRet |            -603 |
|           

t=173600, 14.58301067352295s
t=173800, 14.879212856292725s
t=174000, 15.435725927352905s
t=174200, 14.957031726837158s
t=174400, 15.358761072158813s
t=174600, 15.45168161392212s
t=174800, 14.545133590698242s
t=175000, 15.559396028518677s
t=175200, 14.45837116241455s
t=175400, 14.18012022972107s
t=175600, 14.133208274841309s
t=175800, 13.953686714172363s
---------------------------------------
|             Epoch |              44 |
|      AverageEpRet |        1.04e+03 |
|          StdEpRet |             860 |
|          MaxEpRet |        1.58e+03 |
|          MinEpRet |            -447 |
|  AverageTestEpRet |        1.39e+03 |
|      StdTestEpRet |             664 |
|      MaxTestEpRet |        1.69e+03 |
|      MinTestEpRet |            -571 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.76e+05 |
|     AverageQ1Vals |             118 |
|         StdQ1Vals |            24.4 |
|         MaxQ1Vals |             135 |
|   

t=195200, 14.105297088623047s
t=195400, 14.138603448867798s
t=195600, 13.833981037139893s
t=195800, 13.925790786743164s
---------------------------------------
|             Epoch |              49 |
|      AverageEpRet |        1.56e+03 |
|          StdEpRet |            19.8 |
|          MaxEpRet |        1.59e+03 |
|          MinEpRet |        1.53e+03 |
|  AverageTestEpRet |        1.57e+03 |
|      StdTestEpRet |              23 |
|      MaxTestEpRet |        1.59e+03 |
|      MinTestEpRet |        1.53e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |        1.96e+05 |
|     AverageQ1Vals |             121 |
|         StdQ1Vals |            24.6 |
|         MaxQ1Vals |             138 |
|         MinQ1Vals |           -17.8 |
|     AverageQ2Vals |             121 |
|         StdQ2Vals |            24.6 |
|         MaxQ2Vals |             138 |
|         MinQ2Vals |           -18.4 |
|            LossPi |            -122 |


In [50]:
args = {'env': 'HalfCheetahMuJoCoEnv-v0', 'hid': 256, 'l': 2, 
        'max_hist_len': 2,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': True,
        'freeze_hist_coding': False,
        'exp_name': 'lstm_td3_POMDP_HalfCheetahMuJoCoEnv_PreLSTM1L128_HistMemory2Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_PreLSTM1L128_HistMemory2Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_PreLSTM1L128_HistMemory2Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetahMuJoCoEnv-v0",
    "epochs":	50,
    "exp_name":	"lstm_td3_POMDP_HalfCheetahMuJoCoEnv_PreLSTM1L128_HistMemory2Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000014EE8E62F88>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_POMDP_HalfCheetahMuJoCoEnv_PreLSTM

t=8000, 73.4842963218689s
t=8200, 13.09697699546814s
t=8400, 14.08733057975769s
t=8600, 13.882720470428467s
t=8800, 14.523284912109375s
t=9000, 14.658801078796387s
t=9200, 14.145505905151367s
t=9400, 13.461005687713623s
t=9600, 13.142856121063232s
t=9800, 13.727293014526367s
---------------------------------------
|             Epoch |               5 |
|      AverageEpRet |            -269 |
|          StdEpRet |              22 |
|          MaxEpRet |            -247 |
|          MinEpRet |            -291 |
|  AverageTestEpRet |            -226 |
|      StdTestEpRet |            94.1 |
|      MaxTestEpRet |           -74.9 |
|      MinTestEpRet |            -378 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+04 |
|     AverageQ1Vals |            2.72 |
|         StdQ1Vals |            5.87 |
|         MaxQ1Vals |            31.3 |
|         MinQ1Vals |            -5.5 |
|     AverageQ2Vals |            2.73 |
|   

t=22000, 72.06430125236511s
t=22200, 13.417121410369873s
t=22400, 13.270514965057373s
t=22600, 13.374237060546875s
t=22800, 13.40615177154541s
t=23000, 13.29245662689209s
t=23200, 13.852957010269165s
t=23400, 16.283457279205322s
t=23600, 14.258872032165527s
t=23800, 15.211162567138672s
---------------------------------------
|             Epoch |              12 |
|      AverageEpRet |           -12.2 |
|          StdEpRet |            3.12 |
|          MaxEpRet |           -9.11 |
|          MinEpRet |           -15.3 |
|  AverageTestEpRet |           -23.3 |
|      StdTestEpRet |            53.6 |
|      MaxTestEpRet |              18 |
|      MinTestEpRet |            -129 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         2.4e+04 |
|     AverageQ1Vals |            5.14 |
|         StdQ1Vals |            4.87 |
|         MaxQ1Vals |            38.6 |
|         MinQ1Vals |            -5.2 |
|     AverageQ2Vals |            

t=36000, 73.86747789382935s
t=36200, 13.064065217971802s
t=36400, 13.160807847976685s
t=36600, 13.462002515792847s
t=36800, 14.215990543365479s
t=37000, 13.836005926132202s
t=37200, 18.073309183120728s
t=37400, 16.776140451431274s
t=37600, 14.456376791000366s
t=37800, 13.98756456375122s
---------------------------------------
|             Epoch |              19 |
|      AverageEpRet |            24.7 |
|          StdEpRet |            36.4 |
|          MaxEpRet |            61.2 |
|          MinEpRet |           -11.7 |
|  AverageTestEpRet |           -24.1 |
|      StdTestEpRet |            16.5 |
|      MaxTestEpRet |           -6.43 |
|      MinTestEpRet |           -65.2 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.8e+04 |
|     AverageQ1Vals |            2.41 |
|         StdQ1Vals |            4.97 |
|         MaxQ1Vals |            26.1 |
|         MinQ1Vals |           -9.42 |
|     AverageQ2Vals |           

t=50000, 72.39743781089783s
t=50200, 13.228598833084106s
t=50400, 13.387202739715576s
t=50600, 13.23959732055664s
t=50800, 13.66545844078064s
t=51000, 13.354290246963501s
t=51200, 13.434077262878418s
t=51400, 13.470978498458862s
t=51600, 13.738264322280884s
t=51800, 13.38321304321289s
---------------------------------------
|             Epoch |              26 |
|      AverageEpRet |           -59.2 |
|          StdEpRet |              13 |
|          MaxEpRet |           -46.2 |
|          MinEpRet |           -72.2 |
|  AverageTestEpRet |            94.4 |
|      StdTestEpRet |            55.8 |
|      MaxTestEpRet |             231 |
|      MinTestEpRet |            14.3 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.2e+04 |
|     AverageQ1Vals |            2.14 |
|         StdQ1Vals |            6.41 |
|         MaxQ1Vals |            37.3 |
|         MinQ1Vals |           -18.4 |
|     AverageQ2Vals |            2

t=64000, 69.87714838981628s
t=64200, 12.877565145492554s
t=64400, 13.134877443313599s
t=64600, 13.287469387054443s
t=64800, 13.559741258621216s
t=65000, 13.309410810470581s
t=65200, 13.456018209457397s
t=65400, 13.549768209457397s
t=65600, 13.364264011383057s
t=65800, 13.676429033279419s
---------------------------------------
|             Epoch |              33 |
|      AverageEpRet |           -9.73 |
|          StdEpRet |            4.12 |
|          MaxEpRet |           -5.61 |
|          MinEpRet |           -13.9 |
|  AverageTestEpRet |            4.75 |
|      StdTestEpRet |            8.23 |
|      MaxTestEpRet |            23.4 |
|      MinTestEpRet |           -7.37 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.6e+04 |
|     AverageQ1Vals |            4.41 |
|         StdQ1Vals |            5.46 |
|         MaxQ1Vals |            38.8 |
|         MinQ1Vals |           -14.1 |
|     AverageQ2Vals |          

t=78000, 70.4526538848877s
t=78200, 13.326366424560547s
t=78400, 13.258545637130737s
t=78600, 13.25056791305542s
t=78800, 13.362271070480347s
t=79000, 13.673436880111694s
t=79200, 13.540791511535645s
t=79400, 13.463996887207031s
t=79600, 13.272509098052979s
t=79800, 13.552759885787964s
---------------------------------------
|             Epoch |              40 |
|      AverageEpRet |             345 |
|          StdEpRet |            53.9 |
|          MaxEpRet |             399 |
|          MinEpRet |             291 |
|  AverageTestEpRet |             362 |
|      StdTestEpRet |            88.8 |
|      MaxTestEpRet |             506 |
|      MinTestEpRet |             226 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           8e+04 |
|     AverageQ1Vals |            3.69 |
|         StdQ1Vals |            6.89 |
|         MaxQ1Vals |            38.2 |
|         MinQ1Vals |             -15 |
|     AverageQ2Vals |            

t=92000, 70.81619620323181s
t=92200, 13.680391311645508s
t=92400, 13.439090728759766s
t=92600, 13.408147096633911s
t=92800, 13.410141468048096s
t=93000, 13.719315767288208s
t=93200, 13.502863883972168s
t=93400, 14.651848554611206s
t=93600, 13.530810356140137s
t=93800, 13.810080528259277s
---------------------------------------
|             Epoch |              47 |
|      AverageEpRet |             483 |
|          StdEpRet |            34.4 |
|          MaxEpRet |             517 |
|          MinEpRet |             448 |
|  AverageTestEpRet |             399 |
|      StdTestEpRet |             327 |
|      MaxTestEpRet |             681 |
|      MinTestEpRet |            -514 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         9.4e+04 |
|     AverageQ1Vals |            8.37 |
|         StdQ1Vals |             7.5 |
|         MaxQ1Vals |            43.4 |
|         MinQ1Vals |           -16.1 |
|     AverageQ2Vals |          

In [230]:
args = {'env': 'HalfCheetahMuJoCoEnv-v0', 'hid': 256, 'l': 2, 
        'max_hist_len': 2,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': True,
        'freeze_hist_coding': False,
        'exp_name': 'lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory2Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory2Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory2Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetahMuJoCoEnv-v0",
    "epochs":	50,
    "exp_name":	"lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory2Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000021251572088>":	{
            "epoch_dict":	{},
           

t=8000, 70.25327467918396s
t=8200, 14.52138352394104s
t=8400, 15.066162586212158s
t=8600, 14.8419828414917s
t=8800, 14.387959957122803s
t=9000, 18.686047792434692s
t=9200, 20.70499873161316s
t=9400, 18.67637014389038s
t=9600, 18.217698574066162s
t=9800, 18.197914123535156s
---------------------------------------
|             Epoch |               5 |
|      AverageEpRet |           -69.1 |
|          StdEpRet |              26 |
|          MaxEpRet |           -43.1 |
|          MinEpRet |           -95.2 |
|  AverageTestEpRet |            -367 |
|      StdTestEpRet |            27.4 |
|      MaxTestEpRet |            -341 |
|      MinTestEpRet |            -445 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+04 |
|     AverageQ1Vals |          -0.778 |
|         StdQ1Vals |             5.9 |
|         MaxQ1Vals |            32.2 |
|         MinQ1Vals |           -11.8 |
|     AverageQ2Vals |          -0.778 |
|     

t=22000, 77.58807682991028s
t=22200, 15.429027318954468s
t=22400, 16.34403419494629s
t=22600, 15.134958744049072s
t=22800, 15.61205768585205s
t=23000, 15.061781406402588s
t=23200, 15.037230730056763s
t=23400, 14.86262321472168s
t=23600, 14.999375820159912s
t=23800, 14.964427709579468s
---------------------------------------
|             Epoch |              12 |
|      AverageEpRet |            -116 |
|          StdEpRet |            65.2 |
|          MaxEpRet |           -50.6 |
|          MinEpRet |            -181 |
|  AverageTestEpRet |            -107 |
|      StdTestEpRet |            99.8 |
|      MaxTestEpRet |           -23.3 |
|      MinTestEpRet |            -351 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         2.4e+04 |
|     AverageQ1Vals |             2.4 |
|         StdQ1Vals |             7.3 |
|         MaxQ1Vals |            37.5 |
|         MinQ1Vals |           -17.1 |
|     AverageQ2Vals |             

t=36000, 88.40036845207214s
t=36200, 17.139941215515137s
t=36400, 17.461270093917847s
t=36600, 17.52900981903076s
t=36800, 17.42273235321045s
t=37000, 17.16794228553772s
t=37200, 17.804227828979492s
t=37400, 17.39676332473755s
t=37600, 17.494216203689575s
t=37800, 17.02645778656006s
---------------------------------------
|             Epoch |              19 |
|      AverageEpRet |           -56.9 |
|          StdEpRet |            22.5 |
|          MaxEpRet |           -34.5 |
|          MinEpRet |           -79.4 |
|  AverageTestEpRet |           -65.4 |
|      StdTestEpRet |            40.7 |
|      MaxTestEpRet |            30.1 |
|      MinTestEpRet |            -125 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.8e+04 |
|     AverageQ1Vals |         -0.0203 |
|         StdQ1Vals |            5.61 |
|         MaxQ1Vals |            53.7 |
|         MinQ1Vals |           -29.9 |
|     AverageQ2Vals |         -0.020

t=50000, 93.44942426681519s
t=50200, 18.860031843185425s
t=50400, 18.70378541946411s
t=50600, 19.081193208694458s
t=50800, 18.949068307876587s
t=51000, 18.851338624954224s
t=51200, 20.095632076263428s
t=51400, 19.304933071136475s
t=51600, 18.21287965774536s
t=51800, 19.42193365097046s


KeyboardInterrupt: 

In [18]:
args = {'env': 'HalfCheetahMuJoCoEnv-v0', 'hid': 256, 'l': 2, 
        'max_hist_len': 2,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': True,
        'freeze_hist_coding': False,
        'exp_name': 'lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory2Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory2Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory2Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetahMuJoCoEnv-v0",
    "epochs":	50,
    "exp_name":	"lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory2Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000002123D3417C8>":	{
            "epoch_dict":	{},
           



[32;1m
Number of parameters: 	 pi: 237574, 	 q1: 237697, 	 q2: 237697
[0m
options= 
t=0, 0.0s
t=200, 0.14075636863708496s
t=400, 0.15365266799926758s
t=600, 0.11672449111938477s
t=800, 0.1351473331451416s
t=1000, 0.1296710968017578s
t=1200, 12.290297508239746s
t=1400, 11.70160698890686s
t=1600, 11.81266474723816s
t=1800, 11.789840459823608s
options= 
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -188 |
|          StdEpRet |            51.3 |
|          MaxEpRet |            -137 |
|          MinEpRet |            -239 |
|  AverageTestEpRet |            -228 |
|      StdTestEpRet |            70.7 |
|      MaxTestEpRet |            -166 |
|      MinTestEpRet |            -422 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |         0.00181 |
|         StdQ1Vals |           0.726 |
|         MaxQ1Vals |            4.32 |
|    

t=14000, 71.01084017753601s
t=14200, 13.73158884048462s
t=14400, 13.884454250335693s
t=14600, 14.63728380203247s
t=14800, 16.027279138565063s
t=15000, 13.48826265335083s
t=15200, 13.348370552062988s
t=15400, 13.56716799736023s
t=15600, 15.873248100280762s
t=15800, 14.00564169883728s
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |            -156 |
|          StdEpRet |            10.6 |
|          MaxEpRet |            -145 |
|          MinEpRet |            -166 |
|  AverageTestEpRet |            -228 |
|      StdTestEpRet |             164 |
|      MaxTestEpRet |           -62.4 |
|      MinTestEpRet |            -554 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |             4.7 |
|         StdQ1Vals |            3.75 |
|         MaxQ1Vals |            28.2 |
|         MinQ1Vals |           -18.5 |
|     AverageQ2Vals |             4.

t=28000, 66.86617398262024s
t=28200, 13.396617412567139s
t=28400, 13.610673904418945s
t=28600, 13.524107933044434s
t=28800, 13.49268627166748s
t=29000, 14.462931156158447s
t=29200, 13.388007640838623s
t=29400, 13.100125789642334s
t=29600, 13.776538372039795s
t=29800, 13.75984263420105s
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |              50 |
|          StdEpRet |            90.2 |
|          MaxEpRet |             140 |
|          MinEpRet |           -40.2 |
|  AverageTestEpRet |            72.5 |
|      StdTestEpRet |             131 |
|      MaxTestEpRet |             262 |
|      MinTestEpRet |            -125 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |            4.17 |
|         StdQ1Vals |            5.25 |
|         MaxQ1Vals |            25.3 |
|         MinQ1Vals |           -18.6 |
|     AverageQ2Vals |            

t=42000, 64.99875545501709s
t=42200, 13.390504360198975s
t=42400, 13.356026649475098s
t=42600, 13.28185749053955s
t=42800, 13.238245487213135s
t=43000, 13.421987295150757s
t=43200, 13.656234741210938s
t=43400, 13.51181960105896s
t=43600, 14.168601036071777s
t=43800, 13.650511503219604s
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |             449 |
|          StdEpRet |            22.2 |
|          MaxEpRet |             471 |
|          MinEpRet |             427 |
|  AverageTestEpRet |             450 |
|      StdTestEpRet |            42.8 |
|      MaxTestEpRet |             503 |
|      MinTestEpRet |             354 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.4e+04 |
|     AverageQ1Vals |            10.2 |
|         StdQ1Vals |            9.61 |
|         MaxQ1Vals |            36.1 |
|         MinQ1Vals |           -30.1 |
|     AverageQ2Vals |            

t=56000, 64.86662435531616s
t=56200, 13.55233883857727s
t=56400, 13.410022974014282s
t=56600, 13.350775003433228s
t=56800, 13.018118619918823s
t=57000, 13.504088401794434s
t=57200, 13.226051807403564s
t=57400, 13.594684362411499s
t=57600, 13.246108293533325s
t=57800, 13.864711046218872s
---------------------------------------
|             Epoch |              29 |
|      AverageEpRet |             841 |
|          StdEpRet |            2.84 |
|          MaxEpRet |             843 |
|          MinEpRet |             838 |
|  AverageTestEpRet |             739 |
|      StdTestEpRet |            42.2 |
|      MaxTestEpRet |             776 |
|      MinTestEpRet |             644 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.8e+04 |
|     AverageQ1Vals |            20.5 |
|         StdQ1Vals |            15.5 |
|         MaxQ1Vals |            54.5 |
|         MinQ1Vals |           -48.2 |
|     AverageQ2Vals |           

t=70000, 64.64474892616272s
t=70200, 13.101389646530151s
t=70400, 13.258714199066162s
t=70600, 13.18933916091919s
t=70800, 13.569938898086548s
t=71000, 13.182675838470459s
t=71200, 13.044954061508179s
t=71400, 13.453680038452148s
t=71600, 13.143050909042358s
t=71800, 13.095693111419678s
---------------------------------------
|             Epoch |              36 |
|      AverageEpRet |        1.19e+03 |
|          StdEpRet |            45.1 |
|          MaxEpRet |        1.24e+03 |
|          MinEpRet |        1.15e+03 |
|  AverageTestEpRet |             729 |
|      StdTestEpRet |             763 |
|      MaxTestEpRet |        1.31e+03 |
|      MinTestEpRet |            -473 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         7.2e+04 |
|     AverageQ1Vals |            34.2 |
|         StdQ1Vals |              23 |
|         MaxQ1Vals |            73.7 |
|         MinQ1Vals |           -80.8 |
|     AverageQ2Vals |           

t=84000, 64.61974310874939s
t=84200, 13.220792055130005s
t=84400, 13.295046329498291s
t=84600, 13.08495545387268s
t=84800, 13.202491760253906s
t=85000, 13.295173645019531s
t=85200, 13.51648497581482s
t=85400, 13.26122260093689s
t=85600, 13.282689094543457s
t=85800, 13.381010055541992s
---------------------------------------
|             Epoch |              43 |
|      AverageEpRet |        1.21e+03 |
|          StdEpRet |             145 |
|          MaxEpRet |        1.36e+03 |
|          MinEpRet |        1.07e+03 |
|  AverageTestEpRet |             881 |
|      StdTestEpRet |             459 |
|      MaxTestEpRet |        1.19e+03 |
|      MinTestEpRet |            -479 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         8.6e+04 |
|     AverageQ1Vals |              42 |
|         StdQ1Vals |              35 |
|         MaxQ1Vals |            92.6 |
|         MinQ1Vals |           -99.1 |
|     AverageQ2Vals |             

t=98000, 63.54804277420044s
t=98200, 12.847546815872192s
t=98400, 13.402037858963013s
t=98600, 13.113879919052124s
t=98800, 13.405014276504517s
t=99000, 13.106748342514038s
t=99200, 13.446576833724976s
t=99400, 13.181015729904175s
t=99600, 13.147696256637573s
t=99800, 13.493727922439575s
---------------------------------------
|             Epoch |              50 |
|      AverageEpRet |        1.31e+03 |
|          StdEpRet |            83.4 |
|          MaxEpRet |        1.39e+03 |
|          MinEpRet |        1.22e+03 |
|  AverageTestEpRet |        1.14e+03 |
|      StdTestEpRet |            58.5 |
|      MaxTestEpRet |        1.21e+03 |
|      MinTestEpRet |        1.03e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+05 |
|     AverageQ1Vals |            54.8 |
|         StdQ1Vals |            40.4 |
|         MaxQ1Vals |             108 |
|         MinQ1Vals |            -103 |
|     AverageQ2Vals |          

In [16]:
args = {'env': 'HalfCheetahMuJoCoEnv-v0', 'hid': 256, 'l': 2, 
        'max_hist_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': True,
        'freeze_hist_coding': False,
        'exp_name': 'lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetahMuJoCoEnv-v0",
    "epochs":	50,
    "exp_name":	"lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x00000211BB6C0C08>":	{
            "epoch_dict":	{},
        



[32;1m
Number of parameters: 	 pi: 237574, 	 q1: 237697, 	 q2: 237697
[0m
options= 
t=0, 0.0s
t=200, 0.10973834991455078s
t=400, 0.10990047454833984s
t=600, 0.10532474517822266s
t=800, 0.10705900192260742s
t=1000, 0.11566376686096191s
t=1200, 16.484798669815063s
t=1400, 11.702100038528442s
t=1600, 11.40121054649353s
t=1800, 11.314529418945312s
options= 
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -195 |
|          StdEpRet |            39.5 |
|          MaxEpRet |            -156 |
|          MinEpRet |            -235 |
|  AverageTestEpRet |           -95.4 |
|      StdTestEpRet |            63.8 |
|      MaxTestEpRet |           -32.2 |
|      MinTestEpRet |            -268 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.227 |
|         StdQ1Vals |           0.572 |
|         MaxQ1Vals |             2.9 |
| 

t=14000, 77.8524386882782s
t=14200, 14.603621006011963s
t=14400, 14.139856100082397s
t=14600, 14.071349620819092s
t=14800, 14.0361487865448s
t=15000, 14.258620023727417s
t=15200, 13.85234785079956s
t=15400, 14.122132778167725s
t=15600, 14.10560941696167s
t=15800, 13.709672451019287s
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |           -65.6 |
|          StdEpRet |            14.5 |
|          MaxEpRet |           -51.1 |
|          MinEpRet |           -80.1 |
|  AverageTestEpRet |           -87.3 |
|      StdTestEpRet |            17.4 |
|      MaxTestEpRet |           -40.8 |
|      MinTestEpRet |            -106 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |            1.69 |
|         StdQ1Vals |            4.43 |
|         MaxQ1Vals |            25.2 |
|         MinQ1Vals |           -8.06 |
|     AverageQ2Vals |            1.6

t=28000, 71.12080931663513s
t=28200, 14.200768232345581s
t=28400, 14.115466594696045s
t=28600, 13.879297494888306s
t=28800, 13.865917921066284s
t=29000, 16.309810876846313s
t=29200, 14.970794916152954s
t=29400, 13.964596033096313s
t=29600, 13.854714632034302s
t=29800, 14.072706460952759s
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |             225 |
|          StdEpRet |             144 |
|          MaxEpRet |             369 |
|          MinEpRet |            81.6 |
|  AverageTestEpRet |           -27.8 |
|      StdTestEpRet |             443 |
|      MaxTestEpRet |             665 |
|      MinTestEpRet |            -683 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |            5.05 |
|         StdQ1Vals |            4.97 |
|         MaxQ1Vals |            27.8 |
|         MinQ1Vals |           -9.07 |
|     AverageQ2Vals |          

t=42000, 72.8751893043518s
t=42200, 13.790623188018799s
t=42400, 14.260897874832153s
t=42600, 13.69141149520874s
t=42800, 13.949153184890747s
t=43000, 13.749274730682373s
t=43200, 13.775011539459229s
t=43400, 13.758500099182129s
t=43600, 13.841084957122803s
t=43800, 14.11214804649353s
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |             841 |
|          StdEpRet |            6.77 |
|          MaxEpRet |             848 |
|          MinEpRet |             835 |
|  AverageTestEpRet |             784 |
|      StdTestEpRet |            39.8 |
|      MaxTestEpRet |             855 |
|      MinTestEpRet |             727 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.4e+04 |
|     AverageQ1Vals |              16 |
|         StdQ1Vals |            10.7 |
|         MaxQ1Vals |            38.3 |
|         MinQ1Vals |           -31.9 |
|     AverageQ2Vals |             

t=56000, 68.55168962478638s
t=56200, 13.593414545059204s
t=56400, 13.706295251846313s
t=56600, 13.85551929473877s
t=56800, 13.930777788162231s
t=57000, 13.73952579498291s
t=57200, 13.745540380477905s
t=57400, 13.756092071533203s
t=57600, 14.170276880264282s
t=57800, 13.817581176757812s
---------------------------------------
|             Epoch |              29 |
|      AverageEpRet |        1.05e+03 |
|          StdEpRet |             2.3 |
|          MaxEpRet |        1.05e+03 |
|          MinEpRet |        1.04e+03 |
|  AverageTestEpRet |        1.03e+03 |
|      StdTestEpRet |            10.9 |
|      MaxTestEpRet |        1.06e+03 |
|      MinTestEpRet |        1.02e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.8e+04 |
|     AverageQ1Vals |            32.9 |
|         StdQ1Vals |            15.3 |
|         MaxQ1Vals |            53.3 |
|         MinQ1Vals |           -17.3 |
|     AverageQ2Vals |            

t=70000, 79.94243860244751s
t=70200, 16.13067626953125s
t=70400, 15.44709825515747s
t=70600, 15.424850702285767s
t=70800, 14.864738464355469s
t=71000, 15.174195766448975s
t=71200, 14.63388705253601s
t=71400, 15.00584363937378s
t=71600, 15.070948123931885s
t=71800, 15.571102857589722s
---------------------------------------
|             Epoch |              36 |
|      AverageEpRet |        1.02e+03 |
|          StdEpRet |            26.9 |
|          MaxEpRet |        1.05e+03 |
|          MinEpRet |             998 |
|  AverageTestEpRet |        1.02e+03 |
|      StdTestEpRet |            18.8 |
|      MaxTestEpRet |        1.05e+03 |
|      MinTestEpRet |             985 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         7.2e+04 |
|     AverageQ1Vals |            49.2 |
|         StdQ1Vals |            20.1 |
|         MaxQ1Vals |            70.8 |
|         MinQ1Vals |           -31.3 |
|     AverageQ2Vals |            49

t=84000, 76.10891652107239s
t=84200, 14.358106851577759s
t=84400, 14.64213228225708s
t=84600, 14.664403676986694s
t=84800, 14.996842622756958s
t=85000, 14.338292121887207s
t=85200, 15.359742403030396s
t=85400, 15.432341575622559s
t=85600, 14.885995626449585s
t=85800, 14.920530796051025s
---------------------------------------
|             Epoch |              43 |
|      AverageEpRet |        1.07e+03 |
|          StdEpRet |            11.7 |
|          MaxEpRet |        1.08e+03 |
|          MinEpRet |        1.05e+03 |
|  AverageTestEpRet |        1.27e+03 |
|      StdTestEpRet |            24.5 |
|      MaxTestEpRet |        1.33e+03 |
|      MinTestEpRet |        1.25e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         8.6e+04 |
|     AverageQ1Vals |            62.6 |
|         StdQ1Vals |            23.1 |
|         MaxQ1Vals |            83.8 |
|         MinQ1Vals |           -39.9 |
|     AverageQ2Vals |           

t=98000, 76.54272985458374s
t=98200, 14.000662088394165s
t=98400, 14.435521602630615s
t=98600, 14.092873334884644s
t=98800, 13.997748851776123s
t=99000, 14.072742938995361s
t=99200, 14.191528081893921s
t=99400, 14.168590784072876s
t=99600, 15.385656833648682s
t=99800, 14.574245929718018s
---------------------------------------
|             Epoch |              50 |
|      AverageEpRet |        1.17e+03 |
|          StdEpRet |            25.5 |
|          MaxEpRet |         1.2e+03 |
|          MinEpRet |        1.15e+03 |
|  AverageTestEpRet |        1.14e+03 |
|      StdTestEpRet |             547 |
|      MaxTestEpRet |        1.35e+03 |
|      MinTestEpRet |            -500 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+05 |
|     AverageQ1Vals |            73.7 |
|         StdQ1Vals |            25.2 |
|         MaxQ1Vals |            94.2 |
|         MinQ1Vals |           -61.9 |
|     AverageQ2Vals |          

In [14]:
args = {'env': 'HalfCheetahMuJoCoEnv-v0', 'hid': 256, 'l': 2, 
        'max_hist_len': 10,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': True,
        'freeze_hist_coding': False,
        'exp_name': 'lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetahMuJoCoEnv-v0",
    "epochs":	50,
    "exp_name":	"lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000021199B8C048>":	{
            "epoch_dict":	{},
        



t=200, 0.1466066837310791s
t=400, 0.11620259284973145s
t=600, 0.11320614814758301s
t=800, 0.10638809204101562s
t=1000, 0.12526369094848633s
t=1200, 12.41319227218628s
t=1400, 12.745699405670166s
t=1600, 12.827036142349243s
t=1800, 12.869219779968262s
options= 
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -167 |
|          StdEpRet |            40.6 |
|          MaxEpRet |            -126 |
|          MinEpRet |            -208 |
|  AverageTestEpRet |           -63.4 |
|      StdTestEpRet |            18.6 |
|      MaxTestEpRet |             -27 |
|      MinTestEpRet |           -84.6 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.241 |
|         StdQ1Vals |           0.674 |
|         MaxQ1Vals |            2.72 |
|         MinQ1Vals |           -4.83 |
|     AverageQ2Vals |          -0.241 |
|         StdQ2Vals

t=14000, 67.66913032531738s
t=14200, 13.958319664001465s
t=14400, 14.0866858959198s
t=14600, 14.454351663589478s
t=14800, 13.868379354476929s
t=15000, 14.032731056213379s
t=15200, 14.098395109176636s
t=15400, 13.812031507492065s
t=15600, 13.939192295074463s
t=15800, 14.348578691482544s
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |            -140 |
|          StdEpRet |            33.8 |
|          MaxEpRet |            -106 |
|          MinEpRet |            -174 |
|  AverageTestEpRet |            -194 |
|      StdTestEpRet |             146 |
|      MaxTestEpRet |           -26.8 |
|      MinTestEpRet |            -434 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |          -0.565 |
|         StdQ1Vals |             3.7 |
|         MaxQ1Vals |            23.5 |
|         MinQ1Vals |           -14.2 |
|     AverageQ2Vals |          -0

t=28000, 70.53764295578003s
t=28200, 13.943168878555298s
t=28400, 14.128818273544312s
t=28600, 14.154765605926514s
t=28800, 14.183420658111572s
t=29000, 14.08231234550476s
t=29200, 14.296974420547485s
t=29400, 15.475553035736084s
t=29600, 14.554303169250488s
t=29800, 13.893598556518555s
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |            -143 |
|          StdEpRet |            26.4 |
|          MaxEpRet |            -117 |
|          MinEpRet |            -170 |
|  AverageTestEpRet |           -15.4 |
|      StdTestEpRet |              11 |
|      MaxTestEpRet |            2.45 |
|      MinTestEpRet |           -29.5 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |          -0.964 |
|         StdQ1Vals |            5.32 |
|         MaxQ1Vals |            26.7 |
|         MinQ1Vals |             -30 |
|     AverageQ2Vals |          -

t=42000, 62.56284737586975s
t=42200, 13.035067796707153s
t=42400, 12.982156753540039s
t=42600, 12.77749490737915s
t=42800, 12.838216066360474s
t=43000, 13.090308904647827s
t=43200, 13.075007677078247s
t=43400, 13.016329765319824s
t=43600, 13.015838861465454s
t=43800, 12.951323509216309s
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |            -239 |
|          StdEpRet |             214 |
|          MaxEpRet |           -24.7 |
|          MinEpRet |            -452 |
|  AverageTestEpRet |           -52.1 |
|      StdTestEpRet |            65.8 |
|      MaxTestEpRet |            4.49 |
|      MinTestEpRet |            -193 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.4e+04 |
|     AverageQ1Vals |           -1.68 |
|         StdQ1Vals |            6.61 |
|         MaxQ1Vals |            39.5 |
|         MinQ1Vals |           -22.6 |
|     AverageQ2Vals |           

t=56000, 81.94614458084106s
t=56200, 15.138137340545654s
t=56400, 14.857024192810059s
t=56600, 15.6658353805542s
t=56800, 15.44399619102478s
t=57000, 15.287506103515625s
t=57200, 15.201412916183472s
t=57400, 14.3042631149292s
t=57600, 14.751038312911987s
t=57800, 14.755982398986816s
---------------------------------------
|             Epoch |              29 |
|      AverageEpRet |           -89.8 |
|          StdEpRet |            53.6 |
|          MaxEpRet |           -36.2 |
|          MinEpRet |            -143 |
|  AverageTestEpRet |           -91.4 |
|      StdTestEpRet |            53.7 |
|      MaxTestEpRet |            21.9 |
|      MinTestEpRet |            -160 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.8e+04 |
|     AverageQ1Vals |           -1.52 |
|         StdQ1Vals |            7.37 |
|         MaxQ1Vals |            40.9 |
|         MinQ1Vals |           -38.3 |
|     AverageQ2Vals |           -1.5

t=70000, 70.0077772140503s
t=70200, 15.46160101890564s
t=70400, 16.469299793243408s
t=70600, 15.835710287094116s
t=70800, 14.62963080406189s
t=71000, 15.053450345993042s
t=71200, 14.529408931732178s
t=71400, 14.642811298370361s
t=71600, 14.569302797317505s
t=71800, 14.67489767074585s
---------------------------------------
|             Epoch |              36 |
|      AverageEpRet |           -39.5 |
|          StdEpRet |            51.5 |
|          MaxEpRet |              12 |
|          MinEpRet |             -91 |
|  AverageTestEpRet |           -81.3 |
|      StdTestEpRet |             200 |
|      MaxTestEpRet |             262 |
|      MinTestEpRet |            -402 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         7.2e+04 |
|     AverageQ1Vals |           -3.14 |
|         StdQ1Vals |            6.63 |
|         MaxQ1Vals |            45.4 |
|         MinQ1Vals |           -40.8 |
|     AverageQ2Vals |           -3.

t=84000, 67.25007605552673s
t=84200, 13.757949352264404s
t=84400, 14.114475727081299s
t=84600, 14.090308666229248s
t=84800, 14.30353593826294s
t=85000, 13.900217771530151s
t=85200, 13.679858207702637s
t=85400, 14.127557277679443s
t=85600, 13.925861120223999s
t=85800, 14.37044644355774s
---------------------------------------
|             Epoch |              43 |
|      AverageEpRet |            20.4 |
|          StdEpRet |            9.62 |
|          MaxEpRet |              30 |
|          MinEpRet |            10.8 |
|  AverageTestEpRet |           0.127 |
|      StdTestEpRet |            19.1 |
|      MaxTestEpRet |            26.6 |
|      MinTestEpRet |           -42.6 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         8.6e+04 |
|     AverageQ1Vals |           -2.75 |
|         StdQ1Vals |            7.01 |
|         MaxQ1Vals |            47.7 |
|         MinQ1Vals |           -29.8 |
|     AverageQ2Vals |           -

KeyboardInterrupt: 

In [11]:
args = {'env': 'HalfCheetahMuJoCoEnv-v0', 'hid': 256, 'l': 2, 
        'max_hist_len': 10,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': True,
        'freeze_hist_coding': False,
        'exp_name': 'lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetahMuJoCoEnv-v0",
    "epochs":	50,
    "exp_name":	"lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000002123D345488>":	{
            "epoch_dict":	{},
        



t=200, 0.15609240531921387s
t=400, 0.11323356628417969s
t=600, 0.12463951110839844s
t=800, 0.11070466041564941s
t=1000, 0.10970520973205566s
t=1200, 10.865439653396606s
t=1400, 11.030997514724731s
t=1600, 11.160873651504517s
t=1800, 11.090075492858887s
options= 
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -170 |
|          StdEpRet |            27.9 |
|          MaxEpRet |            -142 |
|          MinEpRet |            -198 |
|  AverageTestEpRet |           -56.8 |
|      StdTestEpRet |             123 |
|      MaxTestEpRet |            19.8 |
|      MinTestEpRet |            -418 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.206 |
|         StdQ1Vals |           0.702 |
|         MaxQ1Vals |            3.31 |
|         MinQ1Vals |           -4.61 |
|     AverageQ2Vals |          -0.207 |
|         StdQ2Va

t=14000, 66.14783811569214s
t=14200, 12.788678169250488s
t=14400, 12.275110960006714s
t=14600, 12.199377298355103s
t=14800, 12.148428678512573s
t=15000, 12.351642847061157s
t=15200, 12.40265703201294s
t=15400, 12.190274477005005s
t=15600, 12.248313903808594s
t=15800, 12.990964412689209s
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |           -83.3 |
|          StdEpRet |            60.7 |
|          MaxEpRet |           -22.6 |
|          MinEpRet |            -144 |
|  AverageTestEpRet |             -85 |
|      StdTestEpRet |             176 |
|      MaxTestEpRet |              55 |
|      MinTestEpRet |            -539 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |            1.14 |
|         StdQ1Vals |            4.25 |
|         MaxQ1Vals |            29.3 |
|         MinQ1Vals |           -14.9 |
|     AverageQ2Vals |           

t=28000, 67.07340908050537s
t=28200, 12.321225881576538s
t=28400, 12.584486961364746s
t=28600, 12.588040828704834s
t=28800, 12.17489218711853s
t=29000, 12.931479454040527s
t=29200, 12.331161975860596s
t=29400, 12.549829006195068s
t=29600, 12.568321466445923s
t=29800, 12.139040231704712s
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |           -89.8 |
|          StdEpRet |            71.9 |
|          MaxEpRet |           -17.9 |
|          MinEpRet |            -162 |
|  AverageTestEpRet |             360 |
|      StdTestEpRet |             156 |
|      MaxTestEpRet |             506 |
|      MinTestEpRet |           -79.9 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |            2.89 |
|         StdQ1Vals |            5.56 |
|         MaxQ1Vals |            34.6 |
|         MinQ1Vals |           -19.6 |
|     AverageQ2Vals |           

t=42000, 66.4321653842926s
t=42200, 12.851767778396606s
t=42400, 12.98999834060669s
t=42600, 12.18759298324585s
t=42800, 12.372651815414429s
t=43000, 12.028695583343506s
t=43200, 12.146288633346558s
t=43400, 12.19094729423523s
t=43600, 12.473945617675781s
t=43800, 11.89150619506836s
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |             882 |
|          StdEpRet |            97.1 |
|          MaxEpRet |             979 |
|          MinEpRet |             785 |
|  AverageTestEpRet |             759 |
|      StdTestEpRet |            24.6 |
|      MaxTestEpRet |             802 |
|      MinTestEpRet |             720 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.4e+04 |
|     AverageQ1Vals |            10.5 |
|         StdQ1Vals |            9.97 |
|         MaxQ1Vals |            43.8 |
|         MinQ1Vals |           -28.4 |
|     AverageQ2Vals |            10.

t=56000, 68.14594101905823s
t=56200, 12.313118934631348s
t=56400, 12.454283714294434s
t=56600, 12.268475770950317s
t=56800, 12.684221029281616s
t=57000, 13.001545429229736s
t=57200, 12.35200309753418s
t=57400, 14.558909893035889s
t=57600, 12.33858346939087s
t=57800, 12.243001699447632s
---------------------------------------
|             Epoch |              29 |
|      AverageEpRet |        1.17e+03 |
|          StdEpRet |            23.4 |
|          MaxEpRet |        1.19e+03 |
|          MinEpRet |        1.14e+03 |
|  AverageTestEpRet |         1.2e+03 |
|      StdTestEpRet |            47.8 |
|      MaxTestEpRet |        1.25e+03 |
|      MinTestEpRet |        1.07e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.8e+04 |
|     AverageQ1Vals |            25.8 |
|         StdQ1Vals |            18.8 |
|         MaxQ1Vals |            61.1 |
|         MinQ1Vals |           -29.2 |
|     AverageQ2Vals |            

t=70000, 78.07277035713196s
t=70200, 12.831007957458496s
t=70400, 14.100029706954956s
t=70600, 12.831621408462524s
t=70800, 12.860393285751343s
t=71000, 14.562692403793335s
t=71200, 13.309882879257202s
t=71400, 13.2172110080719s
t=71600, 13.351751804351807s
t=71800, 14.508796691894531s
---------------------------------------
|             Epoch |              36 |
|      AverageEpRet |        1.15e+03 |
|          StdEpRet |            4.92 |
|          MaxEpRet |        1.16e+03 |
|          MinEpRet |        1.15e+03 |
|  AverageTestEpRet |             732 |
|      StdTestEpRet |             665 |
|      MaxTestEpRet |        1.09e+03 |
|      MinTestEpRet |            -688 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         7.2e+04 |
|     AverageQ1Vals |            41.1 |
|         StdQ1Vals |            26.7 |
|         MaxQ1Vals |            78.6 |
|         MinQ1Vals |           -30.6 |
|     AverageQ2Vals |            

t=84000, 64.1673731803894s
t=84200, 11.674996614456177s
t=84400, 11.739696502685547s
t=84600, 11.8636634349823s
t=84800, 11.866724967956543s
t=85000, 11.713064432144165s
t=85200, 11.59734296798706s
t=85400, 11.925532341003418s
t=85600, 11.728787899017334s
t=85800, 11.770269393920898s
---------------------------------------
|             Epoch |              43 |
|      AverageEpRet |        1.25e+03 |
|          StdEpRet |            28.2 |
|          MaxEpRet |        1.28e+03 |
|          MinEpRet |        1.22e+03 |
|  AverageTestEpRet |        1.35e+03 |
|      StdTestEpRet |            33.1 |
|      MaxTestEpRet |        1.41e+03 |
|      MinTestEpRet |        1.31e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         8.6e+04 |
|     AverageQ1Vals |            52.7 |
|         StdQ1Vals |            34.3 |
|         MaxQ1Vals |            92.9 |
|         MinQ1Vals |           -61.7 |
|     AverageQ2Vals |            52

t=98000, 65.62068486213684s
t=98200, 11.983031749725342s
t=98400, 12.021301984786987s
t=98600, 12.74391770362854s
t=98800, 11.941246032714844s
t=99000, 12.157703161239624s
t=99200, 12.006630420684814s
t=99400, 12.41350269317627s
t=99600, 12.02914047241211s
t=99800, 12.0037362575531s
---------------------------------------
|             Epoch |              50 |
|      AverageEpRet |        1.27e+03 |
|          StdEpRet |            7.73 |
|          MaxEpRet |        1.28e+03 |
|          MinEpRet |        1.27e+03 |
|  AverageTestEpRet |        1.26e+03 |
|      StdTestEpRet |            20.1 |
|      MaxTestEpRet |        1.29e+03 |
|      MinTestEpRet |        1.23e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+05 |
|     AverageQ1Vals |              66 |
|         StdQ1Vals |            35.9 |
|         MaxQ1Vals |             104 |
|         MinQ1Vals |           -68.1 |
|     AverageQ2Vals |              6

In [9]:
args = {'env': 'HalfCheetahMuJoCoEnv-v0', 'hid': 256, 'l': 2, 
        'max_hist_len': 10,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': True,
        'freeze_hist_coding': False,
        'exp_name': 'lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetahMuJoCoEnv-v0",
    "epochs":	50,
    "exp_name":	"lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x00000211BB789A88>":	{
            "epoch_dict":	{},
           



t=400, 0.11865973472595215s
t=600, 0.11070418357849121s
t=800, 0.10870981216430664s
t=1000, 0.12569665908813477s
t=1200, 12.651562452316284s
t=1400, 12.621985673904419s
t=1600, 13.114497184753418s
t=1800, 13.223816394805908s
options= 
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -155 |
|          StdEpRet |            48.2 |
|          MaxEpRet |            -107 |
|          MinEpRet |            -203 |
|  AverageTestEpRet |            -162 |
|      StdTestEpRet |             144 |
|      MaxTestEpRet |            6.08 |
|      MinTestEpRet |            -366 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          0.0151 |
|         StdQ1Vals |           0.762 |
|         MaxQ1Vals |            5.11 |
|         MinQ1Vals |           -4.61 |
|     AverageQ2Vals |          0.0155 |
|         StdQ2Vals |           0.761 |
|    

t=14000, 68.73369574546814s
t=14200, 13.695749998092651s
t=14400, 13.748476028442383s
t=14600, 13.755351066589355s
t=14800, 13.817899942398071s
t=15000, 13.494683504104614s
t=15200, 13.957453966140747s
t=15400, 13.676370859146118s
t=15600, 14.088616371154785s
t=15800, 13.880437135696411s
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |            -293 |
|          StdEpRet |             255 |
|          MaxEpRet |           -38.2 |
|          MinEpRet |            -549 |
|  AverageTestEpRet |           -36.7 |
|      StdTestEpRet |             105 |
|      MaxTestEpRet |            9.85 |
|      MinTestEpRet |            -352 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |           0.178 |
|         StdQ1Vals |            2.83 |
|         MaxQ1Vals |            48.4 |
|         MinQ1Vals |            -8.6 |
|     AverageQ2Vals |          

t=28000, 68.77778744697571s
t=28200, 13.829720258712769s
t=28400, 14.267233848571777s
t=28600, 14.134040594100952s
t=28800, 13.633811235427856s
t=29000, 14.001608848571777s
t=29200, 13.703376054763794s
t=29400, 14.062867879867554s
t=29600, 14.858930587768555s
t=29800, 14.447137117385864s
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |           -3.53 |
|          StdEpRet |            6.39 |
|          MaxEpRet |            2.86 |
|          MinEpRet |           -9.92 |
|  AverageTestEpRet |            -101 |
|      StdTestEpRet |             161 |
|      MaxTestEpRet |            27.8 |
|      MinTestEpRet |            -413 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |           -1.42 |
|         StdQ1Vals |            3.44 |
|         MaxQ1Vals |            32.7 |
|         MinQ1Vals |           -25.3 |
|     AverageQ2Vals |          

t=42000, 68.39479398727417s
t=42200, 13.980169296264648s
t=42400, 14.06415867805481s
t=42600, 13.580034255981445s
t=42800, 13.761409997940063s
t=43000, 14.171798467636108s
t=43200, 14.06843900680542s
t=43400, 13.647061347961426s
t=43600, 14.135332584381104s
t=43800, 14.07827377319336s
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |           -18.9 |
|          StdEpRet |            14.9 |
|          MaxEpRet |           -4.05 |
|          MinEpRet |           -33.8 |
|  AverageTestEpRet |           -6.17 |
|      StdTestEpRet |            43.2 |
|      MaxTestEpRet |              14 |
|      MinTestEpRet |            -134 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.4e+04 |
|     AverageQ1Vals |           -2.87 |
|         StdQ1Vals |            4.79 |
|         MaxQ1Vals |            79.5 |
|         MinQ1Vals |           -33.6 |
|     AverageQ2Vals |           -2

t=56000, 68.12038254737854s
t=56200, 14.012929439544678s
t=56400, 14.174542665481567s
t=56600, 13.894942998886108s
t=56800, 13.9121835231781s
t=57000, 13.838870286941528s
t=57200, 13.763038635253906s
t=57400, 13.813132047653198s
t=57600, 13.919102191925049s
t=57800, 14.066871643066406s
---------------------------------------
|             Epoch |              29 |
|      AverageEpRet |            11.5 |
|          StdEpRet |            10.4 |
|          MaxEpRet |            21.9 |
|          MinEpRet |            1.14 |
|  AverageTestEpRet |           -72.6 |
|      StdTestEpRet |             188 |
|      MaxTestEpRet |            7.95 |
|      MinTestEpRet |            -637 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.8e+04 |
|     AverageQ1Vals |           -4.14 |
|         StdQ1Vals |            5.56 |
|         MaxQ1Vals |            96.6 |
|         MinQ1Vals |             -39 |
|     AverageQ2Vals |           -

KeyboardInterrupt: 

In [7]:
args = {'env': 'HalfCheetahMuJoCoEnv-v0', 'hid': 256, 'l': 2, 
        'max_hist_len': 10,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': True,
        'exp_name': 'lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetahMuJoCoEnv-v0",
    "epochs":	50,
    "exp_name":	"lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist",
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x00000211BB6EF148>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_POMDP_Half



[32;1m
Number of parameters: 	 pi: 78854, 	 q1: 78977, 	 q2: 78977
[0m
options= 
t=0, 0.0s
t=200, 0.10571956634521484s
t=400, 0.10697269439697266s
t=600, 0.10924196243286133s
t=800, 0.10895371437072754s
t=1000, 0.10568547248840332s
t=1200, 12.285820722579956s
t=1400, 11.963626384735107s
t=1600, 12.215986013412476s
t=1800, 12.665575981140137s
options= 
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -261 |
|          StdEpRet |            43.1 |
|          MaxEpRet |            -218 |
|          MinEpRet |            -304 |
|  AverageTestEpRet |           -5.97 |
|      StdTestEpRet |            7.49 |
|      MaxTestEpRet |            2.31 |
|      MinTestEpRet |           -26.2 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.211 |
|         StdQ1Vals |           0.751 |
|         MaxQ1Vals |            5.68 |
|   

t=14000, 65.18557667732239s
t=14200, 13.588233709335327s
t=14400, 13.560912370681763s
t=14600, 13.602149724960327s
t=14800, 13.601069450378418s
t=15000, 13.717743158340454s
t=15200, 13.554569959640503s
t=15400, 13.374191045761108s
t=15600, 13.689148426055908s
t=15800, 13.758216619491577s
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |           -39.2 |
|          StdEpRet |            25.8 |
|          MaxEpRet |           -13.4 |
|          MinEpRet |             -65 |
|  AverageTestEpRet |           -57.6 |
|      StdTestEpRet |             136 |
|      MaxTestEpRet |            7.75 |
|      MinTestEpRet |            -464 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |          -0.144 |
|         StdQ1Vals |            3.37 |
|         MaxQ1Vals |            21.1 |
|         MinQ1Vals |           -7.04 |
|     AverageQ2Vals |          

t=28000, 66.67970371246338s
t=28200, 13.634441375732422s
t=28400, 13.705376386642456s
t=28600, 13.710794925689697s
t=28800, 13.304603815078735s
t=29000, 14.162891149520874s
t=29200, 14.346294641494751s
t=29400, 13.725536584854126s
t=29600, 13.98723292350769s
t=29800, 14.637090682983398s
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |            3.39 |
|          StdEpRet |             7.3 |
|          MaxEpRet |            10.7 |
|          MinEpRet |           -3.92 |
|  AverageTestEpRet |           -50.3 |
|      StdTestEpRet |             107 |
|      MaxTestEpRet |            17.1 |
|      MinTestEpRet |            -331 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |           -1.44 |
|         StdQ1Vals |            2.46 |
|         MaxQ1Vals |            24.6 |
|         MinQ1Vals |           -9.56 |
|     AverageQ2Vals |           

t=42000, 82.9111065864563s
t=42200, 13.912367105484009s
t=42400, 17.5684916973114s
t=42600, 15.246775388717651s
t=42800, 15.20763111114502s
t=43000, 15.35141634941101s
t=43200, 15.800639152526855s
t=43400, 18.30108904838562s
t=43600, 14.979574203491211s
t=43800, 15.444100379943848s
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |            -114 |
|          StdEpRet |            71.6 |
|          MaxEpRet |           -42.9 |
|          MinEpRet |            -186 |
|  AverageTestEpRet |            -156 |
|      StdTestEpRet |            27.3 |
|      MaxTestEpRet |            -126 |
|      MinTestEpRet |            -203 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.4e+04 |
|     AverageQ1Vals |           -1.66 |
|         StdQ1Vals |            5.15 |
|         MaxQ1Vals |             114 |
|         MinQ1Vals |           -17.4 |
|     AverageQ2Vals |           -1.66

t=56000, 80.46535539627075s
t=56200, 15.647200107574463s
t=56400, 14.814730644226074s
t=56600, 15.06778335571289s
t=56800, 15.08118748664856s
t=57000, 15.309653520584106s
t=57200, 14.60860276222229s
t=57400, 14.774792194366455s
t=57600, 14.610618829727173s
t=57800, 14.559203624725342s
---------------------------------------
|             Epoch |              29 |
|      AverageEpRet |            -104 |
|          StdEpRet |            31.2 |
|          MaxEpRet |           -73.1 |
|          MinEpRet |            -135 |
|  AverageTestEpRet |           -50.7 |
|      StdTestEpRet |            34.5 |
|      MaxTestEpRet |           -8.45 |
|      MinTestEpRet |            -120 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.8e+04 |
|     AverageQ1Vals |           -2.78 |
|         StdQ1Vals |            10.2 |
|         MaxQ1Vals |             343 |
|         MinQ1Vals |           -29.4 |
|     AverageQ2Vals |           -2

KeyboardInterrupt: 

In [15]:
args = {'env': 'HalfCheetahMuJoCoEnv-v0', 'hid': 256, 'l': 2, 
        'max_hist_len': 10,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':False,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'partially_observable': True,
        'exp_name': 'lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist\lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetahMuJoCoEnv-v0",
    "epochs":	50,
    "exp_name":	"lstm_td3_POMDP_HalfCheetahMuJoCoEnv_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist",
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000002143B145088>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_POMDP_Half

t=8000, 54.17809844017029s
t=8200, 9.889550924301147s
t=8400, 9.823725700378418s
t=8600, 9.927450180053711s
t=8800, 9.884562492370605s
t=9000, 9.976317882537842s
t=9200, 9.985293626785278s
t=9400, 9.813753366470337s
t=9600, 9.859630346298218s
t=9800, 10.012221574783325s
---------------------------------------
|             Epoch |               5 |
|      AverageEpRet |            -573 |
|          StdEpRet |              25 |
|          MaxEpRet |            -548 |
|          MinEpRet |            -598 |
|  AverageTestEpRet |            -589 |
|      StdTestEpRet |            2.78 |
|      MaxTestEpRet |            -585 |
|      MinTestEpRet |            -594 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+04 |
|     AverageQ1Vals |           -7.15 |
|         StdQ1Vals |            5.89 |
|         MaxQ1Vals |              34 |
|         MinQ1Vals |           -15.7 |
|     AverageQ2Vals |           -7.15 |
|        

t=22000, 54.3057587146759s
t=22200, 9.697064638137817s
t=22400, 9.750920295715332s
t=22600, 9.836692333221436s
t=22800, 9.8715980052948s
t=23000, 10.198723077774048s
t=23200, 10.816071510314941s
t=23400, 11.225977897644043s
t=23600, 9.935425996780396s
t=23800, 9.776851415634155s
---------------------------------------
|             Epoch |              12 |
|      AverageEpRet |            -540 |
|          StdEpRet |           0.309 |
|          MaxEpRet |            -540 |
|          MinEpRet |            -540 |
|  AverageTestEpRet |            -595 |
|      StdTestEpRet |            9.34 |
|      MaxTestEpRet |            -585 |
|      MinTestEpRet |            -614 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         2.4e+04 |
|     AverageQ1Vals |           -21.4 |
|         StdQ1Vals |            2.54 |
|         MaxQ1Vals |            5.43 |
|         MinQ1Vals |           -55.4 |
|     AverageQ2Vals |           -21.4 |


t=36000, 49.63300895690918s
t=36200, 12.254225015640259s
t=36400, 9.903542518615723s
t=36600, 10.465967655181885s
t=36800, 9.322757720947266s
t=37000, 9.460670709609985s
t=37200, 10.961878061294556s
t=37400, 10.18478798866272s
t=37600, 10.335888147354126s
t=37800, 10.56775975227356s
---------------------------------------
|             Epoch |              19 |
|      AverageEpRet |            -544 |
|          StdEpRet |            2.96 |
|          MaxEpRet |            -541 |
|          MinEpRet |            -547 |
|  AverageTestEpRet |            -589 |
|      StdTestEpRet |            2.73 |
|      MaxTestEpRet |            -585 |
|      MinTestEpRet |            -593 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.8e+04 |
|     AverageQ1Vals |           -30.3 |
|         StdQ1Vals |            2.36 |
|         MaxQ1Vals |         -0.0954 |
|         MinQ1Vals |           -72.2 |
|     AverageQ2Vals |           -30.

t=50000, 55.394856214523315s
t=50200, 10.00221037864685s
t=50400, 10.371289253234863s
t=50600, 9.941384315490723s
t=50800, 9.96933650970459s
t=51000, 9.971359729766846s
t=51200, 10.021199226379395s
t=51400, 9.985265016555786s
t=51600, 10.102978944778442s
t=51800, 9.889549970626831s
---------------------------------------
|             Epoch |              26 |
|      AverageEpRet |            -542 |
|          StdEpRet |           0.273 |
|          MaxEpRet |            -542 |
|          MinEpRet |            -543 |
|  AverageTestEpRet |            -590 |
|      StdTestEpRet |            4.54 |
|      MaxTestEpRet |            -585 |
|      MinTestEpRet |            -600 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.2e+04 |
|     AverageQ1Vals |           -37.5 |
|         StdQ1Vals |            7.41 |
|         MaxQ1Vals |           -6.25 |
|         MinQ1Vals |            -105 |
|     AverageQ2Vals |           -37.5

t=64000, 62.49385857582092s
t=64200, 11.821382522583008s
t=64400, 12.151501178741455s
t=64600, 11.519192218780518s
t=64800, 11.83335018157959s
t=65000, 11.677768230438232s
t=65200, 11.597982168197632s
t=65400, 11.639869451522827s
t=65600, 11.776503086090088s
t=65800, 11.554096221923828s
---------------------------------------
|             Epoch |              33 |
|      AverageEpRet |            -546 |
|          StdEpRet |            3.31 |
|          MaxEpRet |            -543 |
|          MinEpRet |            -550 |
|  AverageTestEpRet |            -591 |
|      StdTestEpRet |            7.59 |
|      MaxTestEpRet |            -584 |
|      MinTestEpRet |            -606 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.6e+04 |
|     AverageQ1Vals |           -41.4 |
|         StdQ1Vals |            5.44 |
|         MaxQ1Vals |           -11.9 |
|         MinQ1Vals |            -130 |
|     AverageQ2Vals |           

t=78000, 57.96589756011963s
t=78200, 10.52383017539978s
t=78400, 12.119641542434692s
t=78600, 10.364279747009277s
t=78800, 10.279531240463257s
t=79000, 10.689138412475586s
t=79200, 11.51691484451294s
t=79400, 12.346285104751587s
t=79600, 12.053762197494507s
t=79800, 12.565858602523804s
---------------------------------------
|             Epoch |              40 |
|      AverageEpRet |            -559 |
|          StdEpRet |            10.9 |
|          MaxEpRet |            -548 |
|          MinEpRet |            -570 |
|  AverageTestEpRet |            -590 |
|      StdTestEpRet |            3.38 |
|      MaxTestEpRet |            -585 |
|      MinTestEpRet |            -595 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           8e+04 |
|     AverageQ1Vals |           -44.2 |
|         StdQ1Vals |            4.12 |
|         MaxQ1Vals |           -12.4 |
|         MinQ1Vals |            -146 |
|     AverageQ2Vals |           -

KeyboardInterrupt: 

In [None]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 10,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'nonstationary_env':True,
        'gravity_change_pattern': 'gravity_averagely_equal',
        'exp_name': 'lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(env_name=args['env'], actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'], 
    gravity_change_pattern=args['gravity_change_pattern'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist\lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetah-v2",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_FreezeHist",
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x00000234D1147BC8>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_



[32;1m
Number of parameters: 	 pi: 82310, 	 q1: 82433, 	 q2: 82433
[0m
t=0, 0.0s
t=200, 0.025961637496948242s
t=400, 0.024930953979492188s
t=600, 0.024905920028686523s
t=800, 0.026926517486572266s
t=1000, 0.025961875915527344s
t=1200, 17.0224506855011s
t=1400, 20.037421703338623s
t=1600, 12.372912883758545s
t=1800, 11.962013959884644s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -413 |
|          StdEpRet |             135 |
|          MaxEpRet |            -278 |
|          MinEpRet |            -549 |
|  AverageTestEpRet |            -600 |
|      StdTestEpRet |            1.01 |
|      MaxTestEpRet |            -598 |
|      MinTestEpRet |            -601 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.516 |
|         StdQ1Vals |            0.72 |
|         MaxQ1Vals |            3.08 |
|         MinQ1Vals |

t=14000, 65.35526752471924s
t=14200, 14.52016568183899s
t=14400, 21.476115465164185s
t=14600, 14.947030782699585s
t=14800, 12.612273216247559s
t=15000, 14.233938932418823s
t=15200, 12.720013618469238s
t=15400, 12.992257595062256s
t=15600, 15.818676948547363s
t=15800, 13.068861484527588s
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |            -400 |
|          StdEpRet |            32.4 |
|          MaxEpRet |            -367 |
|          MinEpRet |            -432 |
|  AverageTestEpRet |            -285 |
|      StdTestEpRet |            45.4 |
|      MaxTestEpRet |            -192 |
|      MinTestEpRet |            -347 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |           -5.82 |
|         StdQ1Vals |            4.48 |
|         MaxQ1Vals |            19.7 |
|         MinQ1Vals |           -24.6 |
|     AverageQ2Vals |           

t=28000, 59.15977716445923s
t=28200, 13.08301568031311s
t=28400, 12.931421756744385s
t=28600, 12.891557931900024s
t=28800, 13.01319408416748s
t=29000, 13.006834506988525s
t=29200, 12.683116436004639s
t=29400, 12.89551591873169s
t=29600, 12.778827905654907s
t=29800, 13.155816555023193s
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |             354 |
|          StdEpRet |            36.4 |
|          MaxEpRet |             390 |
|          MinEpRet |             317 |
|  AverageTestEpRet |            62.1 |
|      StdTestEpRet |             297 |
|      MaxTestEpRet |             360 |
|      MinTestEpRet |            -384 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |           -13.3 |
|         StdQ1Vals |            8.94 |
|         MaxQ1Vals |            40.4 |
|         MinQ1Vals |           -54.8 |
|     AverageQ2Vals |           -1

t=42000, 99.43541026115417s
t=42200, 16.607593297958374s
t=42400, 23.555211544036865s
t=42600, 16.642499208450317s
t=42800, 15.467641830444336s
t=43000, 20.31551718711853s
t=43200, 19.146821975708008s
t=43400, 16.521790504455566s
t=43600, 15.666109323501587s
t=43800, 17.338636875152588s
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |             572 |
|          StdEpRet |            45.5 |
|          MaxEpRet |             617 |
|          MinEpRet |             526 |
|  AverageTestEpRet |             422 |
|      StdTestEpRet |            80.5 |
|      MaxTestEpRet |             550 |
|      MinTestEpRet |             268 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.4e+04 |
|     AverageQ1Vals |           -16.4 |
|         StdQ1Vals |            13.5 |
|         MaxQ1Vals |            34.8 |
|         MinQ1Vals |           -93.9 |
|     AverageQ2Vals |           

t=56000, 63.12121343612671s
t=56200, 13.098974227905273s
t=56400, 13.04810881614685s
t=56600, 13.232644319534302s
t=56800, 13.12687063217163s
t=57000, 13.14089584350586s
t=57200, 13.012171983718872s
t=57400, 13.482976913452148s
t=57600, 13.251533508300781s
t=57800, 13.008240461349487s
---------------------------------------
|             Epoch |              29 |
|      AverageEpRet |             809 |
|          StdEpRet |              31 |
|          MaxEpRet |             840 |
|          MinEpRet |             778 |
|  AverageTestEpRet |             628 |
|      StdTestEpRet |            60.8 |
|      MaxTestEpRet |             752 |
|      MinTestEpRet |             555 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.8e+04 |
|     AverageQ1Vals |           -20.8 |
|         StdQ1Vals |            15.7 |
|         MaxQ1Vals |            24.3 |
|         MinQ1Vals |            -126 |
|     AverageQ2Vals |           -2

t=70000, 68.33365321159363s
t=70200, 14.206980466842651s
t=70400, 16.992563486099243s
t=70600, 16.19969940185547s
t=70800, 16.48891592025757s
t=71000, 15.045526504516602s
t=71200, 18.149317502975464s
t=71400, 15.178437232971191s
t=71600, 13.60858702659607s
t=71800, 14.675755977630615s
---------------------------------------
|             Epoch |              36 |
|      AverageEpRet |             778 |
|          StdEpRet |          0.0434 |
|          MaxEpRet |             778 |
|          MinEpRet |             778 |
|  AverageTestEpRet |             657 |
|      StdTestEpRet |             103 |
|      MaxTestEpRet |             766 |
|      MinTestEpRet |             387 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         7.2e+04 |
|     AverageQ1Vals |           -20.6 |
|         StdQ1Vals |            19.1 |
|         MaxQ1Vals |            19.3 |
|         MinQ1Vals |            -137 |
|     AverageQ2Vals |           -2

t=84000, 65.16674447059631s
t=84200, 14.79045033454895s
t=84400, 14.180081367492676s
t=84600, 14.360602140426636s
t=84800, 14.211995601654053s
t=85000, 14.39453673362732s
t=85200, 14.109243154525757s
t=85400, 14.431441307067871s
t=85600, 14.566020011901855s
t=85800, 14.747565746307373s
---------------------------------------
|             Epoch |              43 |
|      AverageEpRet |             902 |
|          StdEpRet |            10.7 |
|          MaxEpRet |             913 |
|          MinEpRet |             891 |
|  AverageTestEpRet |             776 |
|      StdTestEpRet |            52.5 |
|      MaxTestEpRet |             876 |
|      MinTestEpRet |             692 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         8.6e+04 |
|     AverageQ1Vals |           -18.3 |
|         StdQ1Vals |            22.5 |
|         MaxQ1Vals |              23 |
|         MinQ1Vals |            -148 |
|     AverageQ2Vals |           -