In [1]:
from copy import deepcopy
import numpy as np
import torch
from torch.optim import Adam
import pybulletgym
import gym
import time
import spinup.algos.pytorch.lstm_ddpg.core as core
from spinup.utils.logx import EpochLogger
import itertools

In [2]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [3]:
class POMDPWrapper(gym.ObservationWrapper):
    """Observation wrapper that exposes only a fixed subset of observation entries.

    The environment named ``env_name`` is created with ``gym.make``; every
    observation it produces is flattened and reduced to the indices registered
    for that environment (the entries that are dropped are the ones the
    original author labelled as velocity information). ``observation_space``
    is redefined as an unbounded float32 ``Box`` with one entry per kept index.

    Args:
        env_name: Gym environment id. Must be one of the ids listed in
            ``_kept_indices``.

    Raises:
        ValueError: if no index set is registered for ``env_name``.
    """

    @staticmethod
    def _kept_indices(env_name):
        """Return the ascending list of observation indices kept for ``env_name``.

        Returns ``None`` when the environment has no registered index set.
        """

        def span(*bounds):
            # Concatenate half-open integer ranges (lo, hi) into one list.
            return [i for lo, hi in bounds for i in range(lo, hi)]

        def without(total, dropped):
            # Indices 0..total-1 minus `dropped`. Built in ascending order on
            # purpose: the previous `list(set(...) - set(...))` relied on set
            # iteration order for the column order of the observation.
            dropped = set(dropped)
            return [i for i in range(total) if i not in dropped]

        ant = span((0, 13), (27, 111))
        double_pendulum = span((0, 5), (8, 11))
        arm = span((0, 7), (14, 23))
        humanoid = span((0, 22), (45, 185), (269, 376))

        table = {
            # OpenAI Gym -- MuJoCo
            "HalfCheetah-v3": span((0, 8)),
            "HalfCheetah-v2": span((0, 8)),
            "Ant-v3": ant,
            "Ant-v2": ant,
            "Walker2d-v3": span((0, 8)),
            "Walker2d-v2": span((0, 8)),
            "Hopper-v3": span((0, 5)),
            "Hopper-v2": span((0, 5)),
            "InvertedPendulum-v2": span((0, 2)),
            "InvertedDoublePendulum-v2": double_pendulum,
            "Swimmer-v3": span((0, 3)),
            "Swimmer-v2": span((0, 3)),
            "Thrower-v2": arm,
            "Striker-v2": arm,
            "Pusher-v2": arm,
            "Reacher-v2": span((0, 6), (8, 11)),
            "Humanoid-v3": humanoid,
            "Humanoid-v2": humanoid,
            "HumanoidStandup-v2": humanoid,
            # PyBulletGym -- MuJoCo ports
            "HalfCheetahMuJoCoEnv-v0": span((0, 8)),
            "AntMuJoCoEnv-v0": ant,
            "Walker2DMuJoCoEnv-v0": span((0, 8)),
            "HopperMuJoCoEnv-v0": span((0, 7)),
            "InvertedPendulumMuJoCoEnv-v0": span((0, 3)),
            "InvertedDoublePendulumMuJoCoEnv-v0": double_pendulum,
            # PyBulletGym -- Roboschool ports
            "HalfCheetahPyBulletEnv-v0": without(26, range(3, 6)),
            "AntPyBulletEnv-v0": without(28, range(3, 6)),
            "Walker2DPyBulletEnv-v0": without(22, range(3, 6)),
            "HopperPyBulletEnv-v0": without(15, range(3, 6)),
            "InvertedPendulumPyBulletEnv-v0": without(5, [1, 4]),
            "InvertedDoublePendulumPyBulletEnv-v0": without(9, [1, 5, 8]),
            "ReacherPyBulletEnv-v0": without(9, [6, 8]),
        }
        return table.get(env_name)

    def __init__(self, env_name):
        super().__init__(gym.make(env_name))

        kept = self._kept_indices(env_name)
        if kept is None:
            raise ValueError('POMDP for {} is not defined!'.format(env_name))
        self.remain_obs_idx = kept

        # Redefine observation_space to match the reduced observation length.
        n_kept = len(self.remain_obs_idx)
        obs_low = np.full(n_kept, -np.inf, dtype="float32")
        obs_high = np.full(n_kept, np.inf, dtype="float32")
        self.observation_space = gym.spaces.Box(obs_low, obs_high)

    def observation(self, obs):
        """Flatten ``obs`` and return only the kept entries."""
        return obs.flatten()[self.remain_obs_idx]
    

In [4]:
class ReplayBuffer:
    """
    Fixed-capacity FIFO store of (obs, act, rew, next_obs, done) transitions.

    Transitions are written at ``ptr``, which wraps around modulo ``max_size``;
    ``size`` counts how many slots currently hold data. Sampling returns
    float32 torch tensors.
    """

    def __init__(self, obs_dim, act_dim, max_size):
        self.obs_dim, self.act_dim, self.max_size = obs_dim, act_dim, max_size
        obs_shape = core.combined_shape(max_size, obs_dim)
        act_shape = core.combined_shape(max_size, act_dim)
        self.obs_buf = np.zeros(obs_shape, dtype=np.float32)
        self.obs2_buf = np.zeros(obs_shape, dtype=np.float32)
        self.act_buf = np.zeros(act_shape, dtype=np.float32)
        self.rew_buf = np.zeros(max_size, dtype=np.float32)
        self.done_buf = np.zeros(max_size, dtype=np.float32)
        self.ptr = 0
        self.size = 0

    @staticmethod
    def _as_float_tensors(arrays):
        """Convert every array of a dict to a float32 torch tensor."""
        return {key: torch.as_tensor(val, dtype=torch.float32) for key, val in arrays.items()}

    def store(self, obs, act, rew, next_obs, done):
        """Write one transition at the current slot and advance the write pointer."""
        slot = self.ptr
        self.obs_buf[slot] = obs
        self.act_buf[slot] = act
        self.rew_buf[slot] = rew
        self.obs2_buf[slot] = list(next_obs)
        self.done_buf[slot] = done
        self.ptr = (slot + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        """Sample ``batch_size`` transitions uniformly (with replacement)."""
        picks = np.random.randint(0, self.size, size=batch_size)
        return self._as_float_tensors({
            'obs': self.obs_buf[picks],
            'obs2': self.obs2_buf[picks],
            'act': self.act_buf[picks],
            'rew': self.rew_buf[picks],
            'done': self.done_buf[picks],
        })

    def sample_batch_with_history(self, batch_size=32, max_hist_len=100):
        """Sample transitions together with the ``max_hist_len`` transitions stored before each.

        For a sampled index ``i`` the history covers buffer slots
        ``i-max_hist_len .. i-1`` (oldest first). ``hist_act2`` is the same
        window shifted forward by one slot, so it ends with the action of the
        sampled transition itself; ``hist_obs2`` holds the ``obs2`` entries of
        the window. If a ``done`` flag occurs inside the window (its last slot
        is not inspected), only the steps after the most recent such flag are
        kept; they are moved to the front, the remainder is zero-filled and
        ``hist_len`` is set to the number of kept steps.

        With ``max_hist_len == 0`` every history tensor is a single all-zero
        step and ``hist_len`` is 0.

        NOTE(review): ``np.random.randint(max_hist_len, self.size)`` requires
        ``self.size > max_hist_len``. Windows are formed by plain index
        arithmetic, so once the ring buffer has wrapped a window can straddle
        the write pointer -- confirm this is acceptable for the caller.

        Returns:
            dict of float32 tensors with keys obs, obs2, act, rew, done,
            hist_obs, hist_act, hist_obs2, hist_act2, hist_rew, hist_done,
            hist_len.
        """
        idxs = np.random.randint(max_hist_len, self.size, size=batch_size)

        if max_hist_len == 0:
            # No history requested: a single zero step acts as placeholder.
            hist_obs = np.zeros([batch_size, 1, self.obs_dim])
            hist_act = np.zeros([batch_size, 1, self.act_dim])
            hist_obs2 = np.zeros([batch_size, 1, self.obs_dim])
            hist_act2 = np.zeros([batch_size, 1, self.act_dim])
            hist_rew = np.zeros([batch_size, 1])
            hist_done = np.zeros([batch_size, 1])
            hist_len = np.zeros(batch_size)
        else:
            # window[b, j] = idxs[b] - max_hist_len + j, i.e. oldest slot first.
            # Fancy indexing returns copies, so the edits below never touch the buffers.
            window = idxs[:, None] + np.arange(-max_hist_len, 0)[None, :]
            hist_obs = self.obs_buf[window]
            hist_act = self.act_buf[window]
            hist_obs2 = self.obs2_buf[window]
            hist_act2 = self.act_buf[window + 1]  # shifted by one slot: includes a_t
            hist_rew = self.rew_buf[window]
            hist_done = self.done_buf[window]
            hist_len = max_hist_len * np.ones(batch_size)

            segments = (hist_obs, hist_act, hist_obs2, hist_act2, hist_rew, hist_done)
            for row in range(batch_size):
                # The last history slot is deliberately not inspected.
                earlier_dones = np.where(hist_done[row][:-1] == 1)[0]
                if earlier_dones.size == 0:
                    continue
                first_kept = earlier_dones[-1] + 1
                n_kept = max_hist_len - first_kept
                hist_len[row] = n_kept
                # Left-align the steps after the last done and zero the rest.
                for seg in segments:
                    tail = np.copy(seg[row, first_kept:])
                    seg[row] = 0
                    seg[row, :n_kept] = tail

        return self._as_float_tensors({
            'obs': self.obs_buf[idxs],
            'obs2': self.obs2_buf[idxs],
            'act': self.act_buf[idxs],
            'rew': self.rew_buf[idxs],
            'done': self.done_buf[idxs],
            'hist_obs': hist_obs,
            'hist_act': hist_act,
            'hist_obs2': hist_obs2,
            'hist_act2': hist_act2,
            'hist_rew': hist_rew,
            'hist_done': hist_done,
            'hist_len': hist_len,
        })
        

In [5]:
class MLPCritic(nn.Module):
    """Q-network that combines an LSTM summary of past (obs, act) pairs with the current (obs, act).

    Pipeline: history -> pre-LSTM MLP -> LSTM stack -> output at the last valid
    step (optionally scaled by a sigmoid memory gate); current (obs, act) ->
    feature MLP; both parts are concatenated and fed to the post-combination
    MLP, which ends in a single linear output.

    Args:
        obs_dim: size of one observation vector.
        act_dim: size of one action vector.
        mem_pre_lstm_hid_sizes: hidden sizes of the MLP applied before the LSTM.
        mem_lstm_hid_sizes: hidden size of each stacked LSTM layer.
        cur_feature_hid_sizes: hidden sizes of the current-feature MLP.
        post_comb_hid_sizes: hidden sizes of the post-combination MLP.
        mem_gate: if True, scale the memory by a learned sigmoid gate computed
            from the memory and the current (obs, act).
    """

    def __init__(self, obs_dim, act_dim, 
                 mem_pre_lstm_hid_sizes=(128,),
                 mem_lstm_hid_sizes=(128,),
                 cur_feature_hid_sizes=(128,),
                 post_comb_hid_sizes=(128,), mem_gate=True):
        super(MLPCritic, self).__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.mem_gate = mem_gate

        # Containers are created in this order on purpose: it fixes the
        # parameter order and the state_dict keys.
        self.mem_pre_lstm_layers = nn.ModuleList()
        self.mem_lstm_layers = nn.ModuleList()
        self.mem_gate_layer = nn.ModuleList()
        self.cur_feature_layers = nn.ModuleList()
        self.post_combined_layers = nn.ModuleList()

        # Memory: pre-LSTM MLP
        pre_sizes = [obs_dim + act_dim] + list(mem_pre_lstm_hid_sizes)
        for n_in, n_out in zip(pre_sizes[:-1], pre_sizes[1:]):
            self.mem_pre_lstm_layers += [nn.Linear(n_in, n_out), nn.ReLU()]

        # Memory: LSTM stack
        self.mem_lstm_layer_sizes = [pre_sizes[-1]] + list(mem_lstm_hid_sizes)
        for n_in, n_out in zip(self.mem_lstm_layer_sizes[:-1], self.mem_lstm_layer_sizes[1:]):
            self.mem_lstm_layers += [nn.LSTM(n_in, n_out, batch_first=True)]
        mem_size = self.mem_lstm_layer_sizes[-1]

        # Memory gate
        if self.mem_gate:
            self.mem_gate_layer += [nn.Linear(mem_size + obs_dim + act_dim, mem_size),
                                    nn.Sigmoid()]

        # Current feature extraction
        cur_sizes = [obs_dim + act_dim] + list(cur_feature_hid_sizes)
        for n_in, n_out in zip(cur_sizes[:-1], cur_sizes[1:]):
            self.cur_feature_layers += [nn.Linear(n_in, n_out), nn.ReLU()]

        # Post-combination: hidden layers use ReLU, the scalar output layer is linear.
        post_sizes = [mem_size + cur_sizes[-1]] + list(post_comb_hid_sizes) + [1]
        for n_in, n_out in zip(post_sizes[:-2], post_sizes[1:-1]):
            self.post_combined_layers += [nn.Linear(n_in, n_out), nn.ReLU()]
        self.post_combined_layers += [nn.Linear(post_sizes[-2], post_sizes[-1]), nn.Identity()]

    def forward(self, obs, act, hist_obs, hist_act, hist_seg_len):
        """Estimate Q(obs, act | history).

        Args:
            obs: current observations, (batch, obs_dim).
            act: current actions, (batch, act_dim).
            hist_obs: past observations, (batch, steps, obs_dim), valid steps first.
            hist_act: past actions, (batch, steps, act_dim), valid steps first.
            hist_seg_len: number of valid history steps per row, (batch,).
                A value of 0 means "no history": the memory is zeroed for that row.

        Returns:
            Tensor of shape (batch,) with the Q estimates.
        """
        # Rows without history still need a valid gather index: read step 0
        # for them and zero the result with the mask below. `torch.where`
        # builds a new tensor, so the caller's `hist_seg_len` is left intact.
        safe_len = torch.where(hist_seg_len == 0, torch.ones_like(hist_seg_len), hist_seg_len)

        # Memory
        x = torch.cat([hist_obs, hist_act], dim=-1)
        for layer in self.mem_pre_lstm_layers:
            x = layer(x)
        for layer in self.mem_lstm_layers:
            x, _ = layer(x)

        # Pick the LSTM output at the last valid step of every row.
        mem_size = self.mem_lstm_layer_sizes[-1]
        last_step = (safe_len - 1).long().view(-1, 1, 1).expand(-1, 1, mem_size)
        hist_out = torch.gather(x, 1, last_step).squeeze(1)

        # Build the mask on the same device as the data instead of forcing
        # CUDA, so the module also runs on CPU-only hosts.
        hist_msk = (hist_seg_len != 0).to(dtype=hist_out.dtype, device=hist_out.device).view(-1, 1)
        memory = hist_out * hist_msk

        # Memory gate
        if self.mem_gate:
            memory_gate = torch.cat([memory, obs, act], dim=-1)
            for layer in self.mem_gate_layer:
                memory_gate = layer(memory_gate)
            memory = memory_gate * memory

        # Current feature extraction
        x = torch.cat([obs, act], dim=-1)
        for layer in self.cur_feature_layers:
            x = layer(x)

        # Post-combination
        x = torch.cat([memory, x], dim=-1)
        for layer in self.post_combined_layers:
            x = layer(x)
        return torch.squeeze(x, -1) # Critical to ensure q has right shape.

class MLPActor(nn.Module):
    """Deterministic policy that combines an LSTM summary of past (obs, act) pairs with the current obs.

    Pipeline: history -> pre-LSTM MLP -> LSTM stack -> output at the last valid
    step (optionally scaled by a sigmoid memory gate); current obs -> feature
    MLP; both parts are concatenated and fed to the post-combination MLP, whose
    tanh output is scaled by ``act_limit``.

    Args:
        obs_dim: size of one observation vector.
        act_dim: size of one action vector.
        act_limit: scale applied to the tanh output.
        mem_pre_lstm_hid_sizes: hidden sizes of the MLP applied before the LSTM.
        mem_lstm_hid_sizes: hidden size of each stacked LSTM layer.
        cur_feature_hid_sizes: hidden sizes of the current-feature MLP.
        post_comb_hid_sizes: hidden sizes of the post-combination MLP.
        mem_gate: if True, scale the memory by a learned sigmoid gate computed
            from the memory and the current obs.
    """

    def __init__(self, obs_dim, act_dim, act_limit, 
                 mem_pre_lstm_hid_sizes=(128,),
                 mem_lstm_hid_sizes=(128,),
                 cur_feature_hid_sizes=(128,),
                 post_comb_hid_sizes=(128,), mem_gate=True):
        super(MLPActor, self).__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.act_limit = act_limit
        self.mem_gate = mem_gate

        # Containers are created in this order on purpose: it fixes the
        # parameter order and the state_dict keys.
        self.mem_pre_lstm_layers = nn.ModuleList()
        self.mem_lstm_layers = nn.ModuleList()
        self.mem_gate_layer = nn.ModuleList()
        self.cur_feature_layers = nn.ModuleList()
        self.post_combined_layers = nn.ModuleList()

        # Memory: pre-LSTM MLP
        pre_sizes = [obs_dim + act_dim] + list(mem_pre_lstm_hid_sizes)
        for n_in, n_out in zip(pre_sizes[:-1], pre_sizes[1:]):
            self.mem_pre_lstm_layers += [nn.Linear(n_in, n_out), nn.ReLU()]

        # Memory: LSTM stack
        self.mem_lstm_layer_sizes = [pre_sizes[-1]] + list(mem_lstm_hid_sizes)
        for n_in, n_out in zip(self.mem_lstm_layer_sizes[:-1], self.mem_lstm_layer_sizes[1:]):
            self.mem_lstm_layers += [nn.LSTM(n_in, n_out, batch_first=True)]
        mem_size = self.mem_lstm_layer_sizes[-1]

        # Memory gate (the actor's gate sees the memory and the current obs only)
        if self.mem_gate:
            self.mem_gate_layer += [nn.Linear(mem_size + obs_dim, mem_size),
                                    nn.Sigmoid()]

        # Current feature extraction
        cur_sizes = [obs_dim] + list(cur_feature_hid_sizes)
        for n_in, n_out in zip(cur_sizes[:-1], cur_sizes[1:]):
            self.cur_feature_layers += [nn.Linear(n_in, n_out), nn.ReLU()]

        # Post-combination: hidden layers use ReLU, the action layer uses tanh.
        post_sizes = [mem_size + cur_sizes[-1]] + list(post_comb_hid_sizes) + [act_dim]
        for n_in, n_out in zip(post_sizes[:-2], post_sizes[1:-1]):
            self.post_combined_layers += [nn.Linear(n_in, n_out), nn.ReLU()]
        self.post_combined_layers += [nn.Linear(post_sizes[-2], post_sizes[-1]), nn.Tanh()]

    def forward(self, obs, hist_obs, hist_act, hist_seg_len):
        """Compute the action for ``obs`` given its history.

        Args:
            obs: current observations, (batch, obs_dim).
            hist_obs: past observations, (batch, steps, obs_dim), valid steps first.
            hist_act: past actions, (batch, steps, act_dim), valid steps first.
            hist_seg_len: number of valid history steps per row, (batch,).
                A value of 0 means "no history": the memory is zeroed for that row.

        Returns:
            Tensor of shape (batch, act_dim) with entries in
            [-act_limit, act_limit].
        """
        # Rows without history still need a valid gather index: read step 0
        # for them and zero the result with the mask below. `torch.where`
        # builds a new tensor, so the caller's `hist_seg_len` is left intact.
        safe_len = torch.where(hist_seg_len == 0, torch.ones_like(hist_seg_len), hist_seg_len)

        # Memory
        x = torch.cat([hist_obs, hist_act], dim=-1)
        for layer in self.mem_pre_lstm_layers:
            x = layer(x)
        for layer in self.mem_lstm_layers:
            x, _ = layer(x)

        # Pick the LSTM output at the last valid step of every row.
        mem_size = self.mem_lstm_layer_sizes[-1]
        last_step = (safe_len - 1).long().view(-1, 1, 1).expand(-1, 1, mem_size)
        hist_out = torch.gather(x, 1, last_step).squeeze(1)

        # Build the mask on the same device as the data instead of forcing
        # CUDA, so the module also runs on CPU-only hosts.
        hist_msk = (hist_seg_len != 0).to(dtype=hist_out.dtype, device=hist_out.device).view(-1, 1)
        memory = hist_out * hist_msk

        # Memory gate
        if self.mem_gate:
            memory_gate = torch.cat([memory, obs], dim=-1)
            for layer in self.mem_gate_layer:
                memory_gate = layer(memory_gate)
            memory = memory_gate * memory

        # Current feature extraction
        x = obs
        for layer in self.cur_feature_layers:
            x = layer(x)

        # Post-combination
        x = torch.cat([memory, x], dim=-1)
        for layer in self.post_combined_layers:
            x = layer(x)
        return self.act_limit * x

class MLPActorCritic(nn.Module):
    """Bundle of one actor (``pi``) and two critics (``q1``, ``q2``) sharing obs/act sizes.

    The ``critic_*`` keyword arguments are forwarded to both ``MLPCritic``
    instances and the ``actor_*`` keyword arguments to ``MLPActor``.

    Args:
        obs_dim: size of one observation vector.
        act_dim: size of one action vector.
        act_limit: scale of the actor output.
    """

    def __init__(self, obs_dim, act_dim, act_limit=1, 
                 critic_mem_pre_lstm_hid_sizes=(128,),
                 critic_mem_lstm_hid_sizes=(128,),
                 critic_cur_feature_hid_sizes=(128,),
                 critic_post_comb_hid_sizes=(128,), critic_mem_gate=True,
                 actor_mem_pre_lstm_hid_sizes=(128,),
                 actor_mem_lstm_hid_sizes=(128,),
                 actor_cur_feature_hid_sizes=(128,),
                 actor_post_comb_hid_sizes=(128,), actor_mem_gate=True):
        super(MLPActorCritic, self).__init__()
        # `act` needs both sizes to build its empty-history placeholder.
        self.obs_dim = obs_dim
        self.act_dim = act_dim

        critic_kwargs = dict(mem_pre_lstm_hid_sizes=critic_mem_pre_lstm_hid_sizes,
                             mem_lstm_hid_sizes=critic_mem_lstm_hid_sizes,
                             cur_feature_hid_sizes=critic_cur_feature_hid_sizes,
                             post_comb_hid_sizes=critic_post_comb_hid_sizes,
                             mem_gate=critic_mem_gate)
        self.q1 = MLPCritic(obs_dim, act_dim, **critic_kwargs)
        self.q2 = MLPCritic(obs_dim, act_dim, **critic_kwargs)
        self.pi = MLPActor(obs_dim, act_dim, act_limit,
                           mem_pre_lstm_hid_sizes=actor_mem_pre_lstm_hid_sizes,
                           mem_lstm_hid_sizes=actor_mem_lstm_hid_sizes,
                           cur_feature_hid_sizes=actor_cur_feature_hid_sizes,
                           post_comb_hid_sizes=actor_post_comb_hid_sizes, mem_gate=actor_mem_gate)

    def act(self, obs, hist_obs=None, hist_act=None, hist_seg_len=None):
        """Return the policy action for ``obs`` as a NumPy array, without tracking gradients.

        Args:
            obs: observation tensor, (batch, obs_dim).
            hist_obs, hist_act, hist_seg_len: history inputs for ``pi``. If any
                of them is None, an empty history (one all-zero step with
                length 0) is used instead.

        Returns:
            numpy.ndarray with the actions produced by ``pi``.
        """
        if (hist_obs is None) or (hist_act is None) or (hist_seg_len is None):
            # Build the placeholder next to `obs` (same device, same batch
            # size) rather than assuming CUDA and a batch of one.
            n_rows = obs.shape[0]
            hist_obs = torch.zeros(n_rows, 1, self.obs_dim, device=obs.device)
            hist_act = torch.zeros(n_rows, 1, self.act_dim, device=obs.device)
            hist_seg_len = torch.zeros(n_rows, device=obs.device)
        with torch.no_grad():
            return self.pi(obs, hist_obs, hist_act, hist_seg_len).cpu().numpy() 
        

In [13]:
# Scratch check: build a CUDA device handle and echo it as the cell output.
device = torch.device("cuda")
device

device(type='cuda')

In [8]:
# Scratch check: torch.device also accepts a plain string. Note that after this
# cell `device` is bound to the str "cuda", not to a torch.device object.
device = "cuda"
torch.device(device)

device(type='cuda')

In [14]:
# Scratch check: move a random 2x3 tensor onto `device` and display it.
torch.rand([2,3]).to(device)

tensor([[0.2285, 0.5924, 0.4242],
        [0.5068, 0.9532, 0.4619]], device='cuda:0')

In [49]:
def td3(env_name, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, 
        steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, 
        polyak=0.995, pi_lr=1e-3, q_lr=1e-3, 
        batch_size=100, max_hist_len=100,
        start_steps=10000, 
        update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, 
        noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, 
        nonstationary_env = True,
        gravity_change_pattern = 'gravity_averagely_equal',
        partially_observable = False,
        freeze_hist_coding = False,
        logger_kwargs=dict(), save_freq=1):
    """
    Twin Delayed Deep Deterministic Policy Gradient (TD3)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of 
            observations as inputs, and ``q1`` and ``q2`` should accept a batch 
            of observations and a batch of actions as inputs. When called, 
            these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current 
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target 
            policy.

        noise_clip (float): Limit for absolute value of target policy 
            smoothing noise.

        policy_delay (int): Policy will only be updated once every 
            policy_delay times for each update of the Q-networks.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    # Wrapper environment if using POMDP
    if partially_observable == True:
        env, test_env = POMDPWrapper(env_name), POMDPWrapper(env_name)
    else:
        env, test_env = gym.make(env_name), gym.make(env_name)
    obs_dim = env.observation_space.shape[0]   
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    critic_mem_gate = False
    actor_mem_gate = False
    ac = MLPActorCritic( obs_dim, act_dim, act_limit, 
                         critic_mem_pre_lstm_hid_sizes=(128,),
                         critic_mem_lstm_hid_sizes=(128,),
                         critic_cur_feature_hid_sizes=(128,),
                         critic_post_comb_hid_sizes=(128,), critic_mem_gate=critic_mem_gate,
                         actor_mem_pre_lstm_hid_sizes=(128,),
                         actor_mem_lstm_hid_sizes=(128,),
                         actor_cur_feature_hid_sizes=(128,),
                         actor_post_comb_hid_sizes=(128,), actor_mem_gate=actor_mem_gate)
    ac_targ = deepcopy(ac)
    ac.cuda()
    ac_targ.cuda()
    
#     # Freeze hist coding
#     if freeze_hist_coding:
#         ac.q1.layers[0].requires_grad=False
#         ac.q2.layers[0].requires_grad=False
#         ac.pi.layers[0].requires_grad=False

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False
        
    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())
    
    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, max_size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n'%var_counts)

    # Set up function for computing TD3 Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']
        h_o, h_a, h_o2, h_a2, h_len = data['hist_obs'], data['hist_act'], data['hist_obs2'], data['hist_act2'], data['hist_len']

        q1 = ac.q1(o, a, h_o, h_a, h_len)
        q2 = ac.q2(o, a, h_o, h_a, h_len)

        # Bellman backup for Q functions
        with torch.no_grad():
            pi_targ = ac_targ.pi(o2, h_o2, h_a2, h_len)
            
            # Target policy smoothing
            epsilon = torch.randn_like(pi_targ) * target_noise
            epsilon = torch.clamp(epsilon, -noise_clip, noise_clip)
            a2 = pi_targ + epsilon
            a2 = torch.clamp(a2, -act_limit, act_limit)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2, h_o2, h_a2, h_len)
            q2_pi_targ = ac_targ.q2(o2, a2, h_o2, h_a2, h_len)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            q_pi_targ = q1_pi_targ
            backup = r + gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2
        loss_q = loss_q1

        # Useful info for logging
        loss_info = dict(Q1Vals=q1.detach().cpu().numpy(),
                         Q2Vals=q2.detach().cpu().numpy())

        return loss_q, loss_info

    # Set up function for computing TD3 pi loss
    def compute_loss_pi(data):
        o, h_o, h_a, h_len = data['obs'], data['hist_obs'], data['hist_act'], data['hist_len']
        q1_pi = ac.q1(o, ac.pi(o, h_o, h_a, h_len), h_o, h_a, h_len)
        return -q1_pi.mean()

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    q_optimizer = Adam(q_params, lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, timer):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **loss_info)

        # Possibly update pi and target networks
        if timer % policy_delay == 0:

#             # Freeze Q-networks so you don't waste computational effort 
#             # computing gradients for them during the policy learning step.
#             for p in q_params:
#                 p.requires_grad = False

            # Next run one gradient descent step for pi.
            pi_optimizer.zero_grad()
            loss_pi = compute_loss_pi(data)
            loss_pi.backward()
            pi_optimizer.step()

#             # Unfreeze Q-networks so you can optimize it at next DDPG step.
#             for p in q_params:
#                 p.requires_grad = True

            # Record things
            logger.store(LossPi=loss_pi.item())

            # Finally, update target networks by polyak averaging.
            with torch.no_grad():
                for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                    # NB: We use an in-place operations "mul_", "add_" to update target
                    # params, as opposed to "mul" and "add", which would make new tensors.
                    p_targ.data.mul_(polyak)
                    p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, o_buff, a_buff, o_buff_len, noise_scale):
        h_o = torch.tensor(o_buff).view(1, o_buff.shape[0], o_buff.shape[1]).float().cuda()
        h_a = torch.tensor(a_buff).view(1, a_buff.shape[0], a_buff.shape[1]).float().cuda()
        h_l = torch.tensor([o_buff_len]).float().cuda()
        with torch.no_grad(): 
            a = ac.act(torch.as_tensor(o, dtype=torch.float32).view(1,-1).cuda(),
                       h_o, h_a, h_l).reshape(act_dim)
        a += noise_scale * np.random.randn(act_dim)
        if a.shape[0]!=act_dim:
            import pdb
            pdb.set_trace()
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        """Evaluate the current policy without exploration noise.

        Runs ``num_test_episodes`` episodes on ``test_env``. Each episode starts
        with a fresh zero-filled observation/action history, acts through
        ``get_action(..., noise_scale=0)``, and stops on ``done`` or after
        ``max_ep_len`` steps. The episode return and length are recorded with
        ``logger.store`` under ``TestEpRet`` / ``TestEpLen``.
        """
        for _ in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0

            # Fresh history for this episode. With history enabled the first row
            # is pre-filled with the initial observation, but the valid length
            # still starts at 0.
            if max_hist_len > 0:
                o_buff = np.zeros([max_hist_len, obs_dim])
                a_buff = np.zeros([max_hist_len, act_dim])
                o_buff[0, :] = o
            else:
                o_buff = np.zeros([1, obs_dim])
                a_buff = np.zeros([1, act_dim])
            o_buff_len = 0

            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                a = get_action(o, o_buff, a_buff, o_buff_len, 0)
                o2, r, d, _ = test_env.step(a)

                ep_ret += r
                ep_len += 1

                # Append (o, a) to the short history.
                if max_hist_len != 0:
                    if o_buff_len == max_hist_len:
                        # Buffer full: drop the oldest row and write into the last one.
                        o_buff[:-1] = o_buff[1:]
                        a_buff[:-1] = a_buff[1:]
                        slot = max_hist_len - 1
                    else:
                        # Buffer still filling: write into the next free row.
                        slot = o_buff_len
                        o_buff_len += 1
                    o_buff[slot] = o
                    a_buff[slot] = a
                o = o2

            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs

    # Constants of the sinusoidal gravity schedule used when nonstationary_env
    # is enabled (period in environment steps, and the baseline gravity value).
    gravity_cycle = 1000
    gravity_base = -9.81

    def _fresh_history(first_obs):
        """Return zeroed (o_buff, a_buff, o_buff_len) for the start of an episode."""
        if max_hist_len > 0:
            o_hist = np.zeros([max_hist_len, obs_dim])
            a_hist = np.zeros([max_hist_len, act_dim])
            # The first row is pre-filled with the initial observation, but the
            # valid length still starts at 0.
            o_hist[0, :] = first_obs
        else:
            o_hist = np.zeros([1, obs_dim])
            a_hist = np.zeros([1, act_dim])
        return o_hist, a_hist, 0

    o, ep_ret, ep_len = env.reset(), 0, 0
    o_buff, a_buff, o_buff_len = _fresh_history(o)

    # Main loop: collect experience in env and update/log each epoch
    # start_time marks the beginning of training and is never reset, so the
    # 'Time' column reports total elapsed seconds. The 200-step progress print
    # uses its own interval timer.
    start_time = time.time()
    interval_start = start_time
    for t in range(total_steps):
        if t % 200 == 0:
            now = time.time()
            print("t={}, {}s".format(t, now - interval_start))
            interval_start = now

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy (with some noise, via act_noise).
        if t > start_steps:
            a = get_action(o, o_buff, a_buff, o_buff_len, act_noise)
        else:
            a = env.action_space.sample()

        if nonstationary_env == True:
            # Cosine term oscillates in [-1, 1] with period gravity_cycle steps.
            phase = np.cos(2 * np.pi / gravity_cycle * t)
            if gravity_change_pattern == 'gravity_averagely_equal':
                gravity = gravity_base * 1 / 2 * (phase + 1) + gravity_base / 2
            elif gravity_change_pattern == 'gravity_averagely_easier':
                gravity = gravity_base * 1 / 2 * (phase + 1)
            elif gravity_change_pattern == 'gravity_averagely_harder':
                gravity = gravity_base * 1 / 2 * (-phase + 1) + gravity_base
            else:
                # Previously this fell through and left `gravity` undefined,
                # which surfaced as a NameError a few lines later.
                raise ValueError(
                    "Unknown gravity_change_pattern: {!r}".format(gravity_change_pattern))

            if 'PyBulletEnv' in env_name:
                env.env._p.setGravity(0, 0, gravity)
            elif 'Roboschool' in env_name:
                # No gravity change is applied for Roboschool environments.
                pass
            else:
                env.model.opt.gravity[2] = gravity

        # Step the env
        o2, r, d, _ = env.step(a)

        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Append (o, a) to the short history.
        if max_hist_len != 0:
            if o_buff_len == max_hist_len:
                # Buffer full: drop the oldest row and write into the last one.
                o_buff[:-1] = o_buff[1:]
                a_buff[:-1] = a_buff[1:]
                slot = max_hist_len - 1
            else:
                # Buffer still filling: write into the next free row.
                slot = o_buff_len
                o_buff_len += 1
            o_buff[slot] = o
            a_buff[slot] = a

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0
            o_buff, a_buff, o_buff_len = _fresh_history(o)

        # Update handling: every update_every steps, run update_every gradient
        # updates, each on a freshly sampled batch moved to the GPU.
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch_with_history(batch_size, max_hist_len)
                batch = {k: v.cuda() for k, v in batch.items()}
                update(data=batch, timer=j)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            # Seconds elapsed since the main loop started.
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

In [50]:
# Launch: Ant-v2 with partially_observable=True, history length 5, seed 0.
from spinup.utils.run_utils import setup_logger_kwargs

# Run settings, one per line.
args = {
    'env': 'Ant-v2',
    'hid': 256,
    'l': 2,
    'max_hist_len': 5,
    'gamma': 0.99,
    'seed': 0,
    'epochs': 50,
    'nonstationary_env': False,
    'gravity_change_pattern': 'gravity_averagely_equal',
    'partially_observable': True,
    'freeze_hist_coding': False,
    'exp_name': 'RTD3_MemGate_POMDP_Ant_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze',
}

# Logger output location is derived from the experiment name and seed.
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(
    env_name=args['env'],
    actor_critic=core.MLPActorCritic,
    # 'l' hidden layers, each 'hid' units wide.
    ac_kwargs={'hidden_sizes': [args['hid'] for _ in range(args['l'])]},
    max_hist_len=args['max_hist_len'],
    gamma=args['gamma'],
    seed=args['seed'],
    epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'],
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
    logger_kwargs=logger_kwargs,
)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\RTD3_MemGate_POMDP_Ant_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze\RTD3_MemGate_POMDP_Ant_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"Ant-v2",
    "epochs":	50,
    "exp_name":	"RTD3_MemGate_POMDP_Ant_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x00000209EA135D88>":	{
            "epoch_dict":	{},
            "exp_name":	"RTD3_MemGate_POMDP_Ant_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze",
      



t=200, 0.12566375732421875s
t=400, 0.10871148109436035s
t=600, 0.10671401023864746s
t=800, 0.09773731231689453s
t=1000, 0.0967411994934082s
t=1200, 13.122312545776367s
t=1400, 13.113930702209473s
t=1600, 15.874975204467773s
t=1800, 15.427724838256836s
t=2000, 15.180763244628906s
t=2200, 15.567386150360107s
t=2400, 15.473840475082397s
t=2600, 16.079743146896362s
t=2800, 15.218315362930298s
t=3000, 15.481597661972046s
t=3200, 14.773526668548584s
t=3400, 14.510177373886108s
t=3600, 16.87888479232788s
t=3800, 15.759825468063354s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |             365 |
|          StdEpRet |             507 |
|          MaxEpRet |             900 |
|          MinEpRet |            -194 |
|  AverageTestEpRet |             864 |
|      StdTestEpRet |            17.2 |
|      MaxTestEpRet |             887 |
|      MinTestEpRet |             839 |
|             EpLen |             797 |
|         TestEpLen |        

In [48]:
# Launch: HalfCheetah-v2 with partially_observable=False, history length 5, seed 0.
from spinup.utils.run_utils import setup_logger_kwargs

# Run settings, one per line.
args = {
    'env': 'HalfCheetah-v2',
    'hid': 256,
    'l': 2,
    'max_hist_len': 5,
    'gamma': 0.99,
    'seed': 0,
    'epochs': 50,
    'nonstationary_env': False,
    'gravity_change_pattern': 'gravity_averagely_equal',
    'partially_observable': False,
    'freeze_hist_coding': False,
    'exp_name': 'RTD3_NoMemGate_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze',
}

# Logger output location is derived from the experiment name and seed.
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(
    env_name=args['env'],
    actor_critic=core.MLPActorCritic,
    # 'l' hidden layers, each 'hid' units wide.
    ac_kwargs={'hidden_sizes': [args['hid'] for _ in range(args['l'])]},
    max_hist_len=args['max_hist_len'],
    gamma=args['gamma'],
    seed=args['seed'],
    epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'],
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
    logger_kwargs=logger_kwargs,
)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\RTD3_NoMemGate_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze\RTD3_NoMemGate_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetah-v2",
    "epochs":	50,
    "exp_name":	"RTD3_NoMemGate_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x00000209EA16AE48>":	{
            "epoch_dict":	{},
            "exp_name":	"RTD3_NoMemGate_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L12



t=800, 0.039893388748168945s
t=1000, 0.03490638732910156s
t=1200, 9.249303579330444s
t=1400, 8.719646692276001s
t=1600, 8.617952823638916s
t=1800, 9.289445161819458s
t=2000, 8.647948980331421s
t=2200, 8.492283344268799s
t=2400, 8.910146474838257s
t=2600, 9.341049671173096s
t=2800, 8.919153213500977s
t=3000, 9.832670211791992s
t=3200, 10.021872520446777s
t=3400, 9.572370767593384s
t=3600, 10.17482304573059s
t=3800, 9.09268569946289s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -428 |
|          StdEpRet |             144 |
|          MaxEpRet |            -182 |
|          MinEpRet |            -551 |
|  AverageTestEpRet |            -406 |
|      StdTestEpRet |            32.4 |
|      MaxTestEpRet |            -332 |
|      MinTestEpRet |            -448 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           4e+03 |
|     AverageQ1Vals |          -0.563 |
|   

In [46]:
# Launch: HalfCheetah-v2 with partially_observable=False, history length 5, seed 0.
from spinup.utils.run_utils import setup_logger_kwargs

# Run settings, one per line.
args = {
    'env': 'HalfCheetah-v2',
    'hid': 256,
    'l': 2,
    'max_hist_len': 5,
    'gamma': 0.99,
    'seed': 0,
    'epochs': 50,
    'nonstationary_env': False,
    'gravity_change_pattern': 'gravity_averagely_equal',
    'partially_observable': False,
    'freeze_hist_coding': False,
    'exp_name': 'RTD3_MemGate_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze',
}

# Logger output location is derived from the experiment name and seed.
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(
    env_name=args['env'],
    actor_critic=core.MLPActorCritic,
    # 'l' hidden layers, each 'hid' units wide.
    ac_kwargs={'hidden_sizes': [args['hid'] for _ in range(args['l'])]},
    max_hist_len=args['max_hist_len'],
    gamma=args['gamma'],
    seed=args['seed'],
    epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'],
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
    logger_kwargs=logger_kwargs,
)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\RTD3_MemGate_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze\RTD3_MemGate_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetah-v2",
    "epochs":	50,
    "exp_name":	"RTD3_MemGate_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000002098FFA6B08>":	{
            "epoch_dict":	{},
            "exp_name":	"RTD3_MemGate_MDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidSta



t=800, 0.04588007926940918s
t=1000, 0.03290820121765137s
t=1200, 9.148796081542969s
t=1400, 8.632375717163086s
t=1600, 9.247405052185059s
t=1800, 9.051406621932983s
t=2000, 8.358648777008057s
t=2200, 9.787456750869751s
t=2400, 9.633272171020508s
t=2600, 9.167623043060303s
t=2800, 8.793485641479492s
t=3000, 8.858311653137207s
t=3200, 9.356375455856323s
t=3400, 8.833376169204712s
t=3600, 8.989960193634033s
t=3800, 9.163513898849487s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -457 |
|          StdEpRet |            96.9 |
|          MaxEpRet |            -290 |
|          MinEpRet |            -522 |
|  AverageTestEpRet |            -570 |
|      StdTestEpRet |            3.49 |
|      MaxTestEpRet |            -566 |
|      MinTestEpRet |            -575 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           4e+03 |
|     AverageQ1Vals |          -0.627 |
|    

In [44]:
# Launch: HalfCheetah-v2 with partially_observable=True, history length 5, seed 0.
from spinup.utils.run_utils import setup_logger_kwargs

# Run settings, one per line.
args = {
    'env': 'HalfCheetah-v2',
    'hid': 256,
    'l': 2,
    'max_hist_len': 5,
    'gamma': 0.99,
    'seed': 0,
    'epochs': 50,
    'nonstationary_env': False,
    'gravity_change_pattern': 'gravity_averagely_equal',
    'partially_observable': True,
    'freeze_hist_coding': False,
    'exp_name': 'RTD3_MemGate_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze',
}

# Logger output location is derived from the experiment name and seed.
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(
    env_name=args['env'],
    actor_critic=core.MLPActorCritic,
    # 'l' hidden layers, each 'hid' units wide.
    ac_kwargs={'hidden_sizes': [args['hid'] for _ in range(args['l'])]},
    max_hist_len=args['max_hist_len'],
    gamma=args['gamma'],
    seed=args['seed'],
    epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'],
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
    logger_kwargs=logger_kwargs,
)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\RTD3_MemGate_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze\RTD3_MemGate_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetah-v2",
    "epochs":	50,
    "exp_name":	"RTD3_MemGate_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000002098BDB6C08>":	{
            "epoch_dict":	{},
            "exp_name":	"RTD3_MemGate_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L12



[32;1m
Number of parameters: 	 pi: 186374, 	 q1: 187265, 	 q2: 187265
[0m
t=0, 0.0s
t=200, 0.03789877891540527s
t=400, 0.028923749923706055s
t=600, 0.030918121337890625s
t=800, 0.027924537658691406s
t=1000, 0.02892303466796875s
t=1200, 9.28517460823059s
t=1400, 10.71418571472168s
t=1600, 10.764217138290405s
t=1800, 10.88688611984253s
t=2000, 10.978641271591187s
t=2200, 11.200080871582031s
t=2400, 9.960334539413452s
t=2600, 10.463020324707031s
t=2800, 10.324418544769287s
t=3000, 10.81298279762268s
t=3200, 10.512996196746826s
t=3400, 11.822676420211792s
t=3600, 10.832135200500488s
t=3800, 10.281315565109253s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -373 |
|          StdEpRet |             120 |
|          MaxEpRet |            -191 |
|          MinEpRet |            -527 |
|  AverageTestEpRet |            -379 |
|      StdTestEpRet |            22.8 |
|      MaxTestEpRet |            -370 |
|      MinTestEpRet |   

In [42]:
# Launch: HalfCheetah-v2 with partially_observable=True, history length 5, seed 0.
from spinup.utils.run_utils import setup_logger_kwargs

# Run settings, one per line.
args = {
    'env': 'HalfCheetah-v2',
    'hid': 256,
    'l': 2,
    'max_hist_len': 5,
    'gamma': 0.99,
    'seed': 0,
    'epochs': 50,
    'nonstationary_env': False,
    'gravity_change_pattern': 'gravity_averagely_equal',
    'partially_observable': True,
    'freeze_hist_coding': False,
    'exp_name': 'RTD3_NoMemGate_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze',
}

# Logger output location is derived from the experiment name and seed.
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(
    env_name=args['env'],
    actor_critic=core.MLPActorCritic,
    # 'l' hidden layers, each 'hid' units wide.
    ac_kwargs={'hidden_sizes': [args['hid'] for _ in range(args['l'])]},
    max_hist_len=args['max_hist_len'],
    gamma=args['gamma'],
    seed=args['seed'],
    epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'],
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
    logger_kwargs=logger_kwargs,
)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\RTD3_NoMemGate_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze\RTD3_NoMemGate_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"HalfCheetah-v2",
    "epochs":	50,
    "exp_name":	"RTD3_NoMemGate_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_1L128_NoFreeze",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000002098BDB6C08>":	{
            "epoch_dict":	{},
            "exp_name":	"RTD3_NoMemGate_POMDP_HalfCheetah_PreLSTM1L128_HistMemory5Len_



[32;1m
Number of parameters: 	 pi: 186374, 	 q1: 187265, 	 q2: 187265
[0m
t=0, 0.0s
t=200, 0.039893150329589844s
t=400, 0.03490638732910156s
t=600, 0.03390979766845703s
t=800, 0.0319516658782959s
t=1000, 0.04387617111206055s
t=1200, 9.922784566879272s
t=1400, 9.478551387786865s
t=1600, 9.07471776008606s
t=1800, 9.385892391204834s
t=2000, 9.4318106174469s
t=2200, 9.666118621826172s
t=2400, 9.423826694488525s
t=2600, 9.5464768409729s
t=2800, 10.541809320449829s
t=3000, 10.297365188598633s
t=3200, 10.484933376312256s
t=3400, 10.098025560379028s
t=3600, 10.224345207214355s
t=3800, 9.932472229003906s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -392 |
|          StdEpRet |            17.5 |
|          MaxEpRet |            -365 |
|          MinEpRet |            -409 |
|  AverageTestEpRet |             148 |
|      StdTestEpRet |             6.7 |
|      MaxTestEpRet |             157 |
|      MinTestEpRet |             1

In [39]:
# Launch: Ant-v2 with partially_observable=True, history length 5, seed 0.
from spinup.utils.run_utils import setup_logger_kwargs

# Run settings, one per line.
args = {
    'env': 'Ant-v2',
    'hid': 256,
    'l': 2,
    'max_hist_len': 5,
    'gamma': 0.99,
    'seed': 0,
    'epochs': 50,
    'nonstationary_env': False,
    'gravity_change_pattern': 'gravity_averagely_equal',
    'partially_observable': True,
    'freeze_hist_coding': False,
    'exp_name': 'test_pre_feature_extraction_gated_lstm_DDPG_POMDP_Ant_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze',
}

# Logger output location is derived from the experiment name and seed.
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(
    env_name=args['env'],
    actor_critic=core.MLPActorCritic,
    # 'l' hidden layers, each 'hid' units wide.
    ac_kwargs={'hidden_sizes': [args['hid'] for _ in range(args['l'])]},
    max_hist_len=args['max_hist_len'],
    gamma=args['gamma'],
    seed=args['seed'],
    epochs=args['epochs'],
    nonstationary_env=args['nonstationary_env'],
    gravity_change_pattern=args['gravity_change_pattern'],
    partially_observable=args['partially_observable'],
    freeze_hist_coding=args['freeze_hist_coding'],
    logger_kwargs=logger_kwargs,
)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\test_pre_feature_extraction_gated_lstm_DDPG_POMDP_Ant_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze\test_pre_feature_extraction_gated_lstm_DDPG_POMDP_Ant_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_name":	"Ant-v2",
    "epochs":	50,
    "exp_name":	"test_pre_feature_extraction_gated_lstm_DDPG_POMDP_Ant_PreLSTM1L128_HistMemory5Len_LSTM1L128_HidStateLayer0_2L128_NoFreeze",
    "freeze_hist_coding":	false,
    "gamma":	0.99,
    "gravity_change_pattern":	"gravity_averagely_equal",
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000002098FFC2548>":	{
            "epoch_dict":	{},
            "exp_name":	"test_p