# State Representation shared by Q and Pi

In [1]:
from copy import deepcopy
import numpy as np
import torch
from torch.optim import Adam
import gym
import time
import spinup.algos.pytorch.lstm_ddpg.core as core
from spinup.utils.logx import EpochLogger
import itertools

In [2]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [40]:
class LSTMHistoryStateRep(nn.Module):
    def __init__(self, input_dim, lstm_hid_dim=128, lstm_hid_lay_num=1):
        super(LSTMObsStateRep, self).__init__()
        self.input_dim = input_dim
        self.lstm_hid_dim = lstm_hid_dim
        self.lstm_hid_lay_num = lstm_hid_lay_num
        
        self.layers = nn.ModuleList()
        self.layers += [nn.LSTM(input_dim, lstm_hid_dim, lstm_hid_lay_num, batch_first=True)]
        
    def forward(self, x, seg_len=None, gather_last=False):
        if seg_len is not None:
            input_packed = pack_padded_sequence(x, lengths=seg_len,
                                                batch_first=True, enforce_sorted=False)
        else:
            input_packed = pack_padded_sequence(x, lengths=[obs.size(1) for _ in range(obs.size(0))],
                                                batch_first=True, enforce_sorted=False)
        
        lstm_output_packed, (lstm_hidden_state, lstm_cell_state) = self.layers[0](input_packed)
        lstm_output_padded, lstm_output_lengths = pad_packed_sequence(lstm_output_packed, batch_first=True)
        output = lstm_output_padded
        if gather_last:
            return torch.gather(output, 1, (seg_len-1).view(-1,1).repeat(1, self.lstm_hid_dim).unsqueeze(1).long()).squeeze(1)
        else:
            return output
        
    
class LSTMActStateRep(nn.Module):
    def __init__(self, act_dim, lstm_hid_dim=128, lstm_hid_lay_num=1):
        super(LSTMActStateRep, self).__init__()
        self.act_dim = act_dim
        self.lstm_hid_dim = lstm_hid_dim
        self.lstm_hid_lay_num = lstm_hid_lay_num
        
        
    def forward(self, act, seg_len):
        pass

class LSTMStateRep(nn.Module):
    def __init__(self, obs_dim, act_dim, 
                 obs_lstm_hid_dim=128, obs_lstm_hid_lay_num=1,
                 act_lstm_hid_dim=128, act_lstm_hid_lay_num=1):
        super(LSTMStateRep, self).__init__()
        self.obs_state_rep = LSTMObsStateRep(obs_dim, obs_lstm_hid_dim=128, obs_lstm_hid_lay_num=1)
        self.act_state_rep = LSTMActStateRep(act_dim, act_lstm_hid_dim=128, act_lstm_hid_lay_num=1)
        
        
    def forward(self, obs, act, seg_len):
        obs_s = self.obs_state_rep(obs, seg_len)
        act_s = self.act_state_rep(act, seg_len-1)
        
class MLPActor(nn.Module):
    def __init__(self, state_rep):
        pass
        
class MLPCritic(nn.Module):
    def __init__(self, state_rep):
        pass
        
class MLPActorCritic(nn.Module):
    def __init__(self, obs_dim, act_dim, act_limit):
        pass

In [364]:
class MLPCritic(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_sizes=(128, 128)):
        super(MLPCritic, self).__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        
        self.layers = nn.ModuleList()
        layer_size = [obs_dim+act_dim]+list(hidden_sizes) + [1]
        for h in range(len(layer_size)-2):
            self.layers += [nn.Linear(layer_size[h], layer_size[h+1]), nn.ReLU()]
        self.layers += [nn.Linear(layer_size[-2], layer_size[-1]), nn.Identity()]
    
    def forward(self, obs, act):
        cat_input = torch.cat([obs, act], dim=-1)
        x = cat_input
        for layer in self.layers:
            x = layer(x)
        return torch.squeeze(x, -1) # Critical to ensure q has right shape.

class MLPActor(nn.Module):
    def __init__(self, obs_dim, act_dim, act_limit, hidden_sizes=(128, 128)):
        super(MLPActor, self).__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.act_limit = act_limit
        
        self.layers = nn.ModuleList()
        layer_size = [obs_dim]+list(hidden_sizes) + [act_dim]
        for h in range(len(layer_size)-2):
            self.layers += [nn.Linear(layer_size[h], layer_size[h+1]), nn.ReLU()]
        self.layers += [nn.Linear(layer_size[-2], layer_size[-1]), nn.Tanh()]
    
    def forward(self, obs):
        x = obs
        for layer in self.layers:
            x = layer(x)
        return self.act_limit * x

class MLPActorCritic(nn.Module):
    def __init__(self, obs_dim, act_dim, act_limit, 
                 critic_hidden_sizes=(128, 128), actor_hidden_sizes=(128, 128)):
        super(MLPActorCritic, self).__init__()
        self.q1 = MLPCritic(obs_dim, act_dim, hidden_sizes=critic_hidden_sizes)
        self.q2 = MLPCritic(obs_dim, act_dim, hidden_sizes=critic_hidden_sizes)
        self.pi = MLPActor(obs_dim, act_dim, act_limit=1, hidden_sizes=actor_hidden_sizes)
    
    def act(self, obs):
        with torch.no_grad():
            return self.pi(obs).cpu().numpy() 
        

## Actor-Critic with History Memory

In [439]:
hist.shape

torch.Size([32, 1, 128])

In [437]:
torch.tensor([1 if s_l !=0 else 0 for s_l in seg_len]).view(-1,1).repeat(1, 128)

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0,

In [431]:
torch.tensor([s_l if s_l !=0 else 1 for s_l in seg_len])

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1])

In [563]:
seg_len[0:4] = 10

In [564]:
seg_len

tensor([10., 10., 10., 10.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.])

In [566]:
seg_len[seg_len ==0] = 1
seg_len

tensor([10., 10., 10., 10.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.])

In [502]:
(seg_len != 0).float().view(-1,1).repeat(1, 12)


tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0

In [452]:
torch.relu(torch.sign(torch.tanh(seg_len))).view(-1,1).repeat(1, 12)

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0

In [441]:
torch.sigmoid(seg_len)-0.5

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.], dtype=torch.float64)

In [430]:
x = torch.tensor(np.random.randn(32, 1, 15)).float()
seg_len = torch.tensor(0*np.ones(32))
tmp_seg_len = torch.tensor([s_l if s_l !=0 else 1 for s_l in seg_len])
torch.tensor([1 if s_l !=0 else 0 for s_l in seg_len])
tmp_seg_msk = torch.ones([32, , 15])
pack_padded_sequence(x, lengths=seg_len, batch_first=True, enforce_sorted=False)

RuntimeError: Length of all samples has to be greater than 0, but found an element in 'lengths' that is <= 0

In [615]:
class MLPCritic(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_sizes=(128, 128), 
                 lstm_hid_dim=128, lstm_hid_lay_num=1):
        super(MLPCritic, self).__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.lstm_hid_dim = lstm_hid_dim
        
        self.layers = nn.ModuleList()
        # LSTM
        self.layers += [nn.LSTM(obs_dim+act_dim, lstm_hid_dim, lstm_hid_lay_num, batch_first=True)]
        # 
        layer_size = [lstm_hid_dim+obs_dim+act_dim]+list(hidden_sizes) + [1]
        for h in range(len(layer_size)-2):
            self.layers += [nn.Linear(layer_size[h], layer_size[h+1]), nn.ReLU()]
        self.layers += [nn.Linear(layer_size[-2], layer_size[-1]), nn.Identity()]
    
    def forward(self, obs, act, hist_obs, hist_act, hist_seg_len=None):
        #
        random_hist = torch.as_tensor(np.random.randn(obs.shape[0], self.lstm_hid_dim), dtype=torch.float32).cuda()
        #
        x = torch.cat([random_hist, obs, act], dim=-1)
        for layer in self.layers[1:]:
            x = layer(x)
        return torch.squeeze(x, -1) # Critical to ensure q has right shape.

class MLPActor(nn.Module):
    def __init__(self, obs_dim, act_dim, act_limit, hidden_sizes=(128, 128),
                 lstm_hid_dim=128, lstm_hid_lay_num=1):
        super(MLPActor, self).__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.act_limit = act_limit
        self.lstm_hid_dim = lstm_hid_dim
        
        self.layers = nn.ModuleList()
        #
        self.layers += [nn.LSTM(obs_dim+act_dim, lstm_hid_dim, lstm_hid_lay_num, batch_first=True)]
        #
        layer_size = [lstm_hid_dim+obs_dim]+list(hidden_sizes) + [act_dim]
        for h in range(len(layer_size)-2):
            self.layers += [nn.Linear(layer_size[h], layer_size[h+1]), nn.ReLU()]
        self.layers += [nn.Linear(layer_size[-2], layer_size[-1]), nn.Tanh()]
    
    def forward(self, obs, hist_obs, hist_act, hist_seg_len=None):
        #
        random_hist = torch.as_tensor(np.random.randn(obs.shape[0], self.lstm_hid_dim), dtype=torch.float32).cuda()
        #
        x = torch.cat([random_hist, obs], dim=-1)
        for layer in self.layers[1:]:
            x = layer(x)
        return self.act_limit * x

class MLPActorCriticRandomHist(nn.Module):
    def __init__(self, obs_dim, act_dim, act_limit, 
                 critic_hidden_sizes=(128, 128), actor_hidden_sizes=(128, 128)):
        super(MLPActorCriticRandomHist, self).__init__()
        self.q1 = MLPCritic(obs_dim, act_dim, hidden_sizes=critic_hidden_sizes)
        self.q2 = MLPCritic(obs_dim, act_dim, hidden_sizes=critic_hidden_sizes)
        self.pi = MLPActor(obs_dim, act_dim, act_limit=1, hidden_sizes=actor_hidden_sizes)
    
    def act(self, obs, hist_obs=None, hist_act=None, hist_seg_len=None):
        if (hist_obs is None) or (hist_act is None) or (hist_seg_len is None):
            hist_obs = torch.zeros(1, 1, self.obs_dim).cuda()
            hist_act = torch.zeros(1, 1, self.act_dim).cuda()
            hist_seg_len = torch.zeros(1).cuda()
        with torch.no_grad():
            return self.pi(obs, hist_obs, hist_act, hist_seg_len).cpu().numpy() 
        

In [31]:
class MLPCritic(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_sizes=(128, 128), 
                 lstm_hid_dim=64, lstm_hid_lay_num=2):
        super(MLPCritic, self).__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.lstm_hid_dim = lstm_hid_dim
        
        self.layers = nn.ModuleList()
        # LSTM
        self.layers += [nn.LSTM(obs_dim+act_dim, lstm_hid_dim, lstm_hid_lay_num, batch_first=True)]
        # 
        layer_size = [lstm_hid_dim+obs_dim+act_dim]+list(hidden_sizes) + [1]
        for h in range(len(layer_size)-2):
            self.layers += [nn.Linear(layer_size[h], layer_size[h+1]), nn.ReLU()]
        self.layers += [nn.Linear(layer_size[-2], layer_size[-1]), nn.Identity()]
    
    def forward(self, obs, act, hist_obs, hist_act, hist_seg_len=None):
        #
        tmp_hist_seg_len = deepcopy(hist_seg_len)
        tmp_hist_seg_len[hist_seg_len == 0] = 1
        
        x = torch.cat([hist_obs, hist_act], dim=-1)
        if hist_seg_len is not None:
            input_packed = pack_padded_sequence(x, lengths=tmp_hist_seg_len,
                                                batch_first=True, enforce_sorted=False)
        else:
            input_packed = pack_padded_sequence(x, lengths=[obs.size(1) for _ in range(obs.size(0))],
                                                batch_first=True, enforce_sorted=False)
        
        lstm_output_packed, (lstm_hidden_state, lstm_cell_state) = self.layers[0](input_packed)
        lstm_output_padded, lstm_output_lengths = pad_packed_sequence(lstm_output_packed, batch_first=True)

        hist_out = torch.gather(lstm_output_padded, 1, (tmp_hist_seg_len-1).view(-1,1).repeat(1, self.lstm_hid_dim).unsqueeze(1).long()).squeeze(1)
        hist_msk = (hist_seg_len != 0).float().view(-1,1).repeat(1, self.lstm_hid_dim).cuda()
        #
        x = torch.cat([hist_out*hist_msk, obs, act], dim=-1)
        for layer in self.layers[1:]:
            x = layer(x)
        return torch.squeeze(x, -1) # Critical to ensure q has right shape.

class MLPActor(nn.Module):
    def __init__(self, obs_dim, act_dim, act_limit, hidden_sizes=(128, 128),
                 lstm_hid_dim=64, lstm_hid_lay_num=2):
        super(MLPActor, self).__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.act_limit = act_limit
        self.lstm_hid_dim = lstm_hid_dim
        
        self.layers = nn.ModuleList()
        #
        self.layers += [nn.LSTM(obs_dim+act_dim, lstm_hid_dim, lstm_hid_lay_num, batch_first=True)]
        #
        layer_size = [lstm_hid_dim+obs_dim]+list(hidden_sizes) + [act_dim]
        for h in range(len(layer_size)-2):
            self.layers += [nn.Linear(layer_size[h], layer_size[h+1]), nn.ReLU()]
        self.layers += [nn.Linear(layer_size[-2], layer_size[-1]), nn.Tanh()]
    
    def forward(self, obs, hist_obs, hist_act, hist_seg_len=None):
        #
        tmp_hist_seg_len = deepcopy(hist_seg_len)
        tmp_hist_seg_len[hist_seg_len == 0] = 1
        
        x = torch.cat([hist_obs, hist_act], dim=-1)
        if hist_seg_len is not None:
            input_packed = pack_padded_sequence(x, lengths=tmp_hist_seg_len,
                                                batch_first=True, enforce_sorted=False)
        else:
            input_packed = pack_padded_sequence(x, lengths=[obs.size(1) for _ in range(obs.size(0))],
                                                batch_first=True, enforce_sorted=False)
        
        lstm_output_packed, (lstm_hidden_state, lstm_cell_state) = self.layers[0](input_packed)
        lstm_output_padded, lstm_output_lengths = pad_packed_sequence(lstm_output_packed, batch_first=True)
        
        hist_out = torch.gather(lstm_output_padded, 1, (tmp_hist_seg_len-1).view(-1,1).repeat(1, self.lstm_hid_dim).unsqueeze(1).long()).squeeze(1)
        hist_msk = (hist_seg_len != 0).float().view(-1,1).repeat(1, self.lstm_hid_dim).cuda()
        #
        x = torch.cat([hist_out*hist_msk, obs], dim=-1)
        for layer in self.layers[1:]:
            x = layer(x)
        return self.act_limit * x

class MLPActorCritic(nn.Module):
    def __init__(self, obs_dim, act_dim, act_limit, 
                 critic_hidden_sizes=(128, 128), actor_hidden_sizes=(128, 128),
                 lstm_hid_dim=64, lstm_hid_lay_num=1):
        super(MLPActorCritic, self).__init__()
        self.q1 = MLPCritic(obs_dim, act_dim, hidden_sizes=critic_hidden_sizes,
                            lstm_hid_dim=lstm_hid_dim, lstm_hid_lay_num=lstm_hid_lay_num)
        self.q2 = MLPCritic(obs_dim, act_dim, hidden_sizes=critic_hidden_sizes,
                            lstm_hid_dim=lstm_hid_dim, lstm_hid_lay_num=lstm_hid_lay_num)
        self.pi = MLPActor(obs_dim, act_dim, act_limit=1, hidden_sizes=actor_hidden_sizes,
                           lstm_hid_dim=lstm_hid_dim, lstm_hid_lay_num=lstm_hid_lay_num)
    
    def act(self, obs, hist_obs=None, hist_act=None, hist_seg_len=None):
        if (hist_obs is None) or (hist_act is None) or (hist_seg_len is None):
            hist_obs = torch.zeros(1, 1, self.obs_dim).cuda()
            hist_act = torch.zeros(1, 1, self.act_dim).cuda()
            hist_seg_len = torch.zeros(1).cuda()
        with torch.no_grad():
            return self.pi(obs, hist_obs, hist_act, hist_seg_len).cpu().numpy() 
        

### Hist of observation only

In [36]:
class MLPCritic(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_sizes=(128, 128), 
                 lstm_hid_dim=64, lstm_hid_lay_num=2):
        super(MLPCritic, self).__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.lstm_hid_dim = lstm_hid_dim
        
        self.layers = nn.ModuleList()
        # LSTM
        self.layers += [nn.LSTM(obs_dim, lstm_hid_dim, lstm_hid_lay_num, batch_first=True)]
        # 
        layer_size = [lstm_hid_dim+obs_dim+act_dim]+list(hidden_sizes) + [1]
        for h in range(len(layer_size)-2):
            self.layers += [nn.Linear(layer_size[h], layer_size[h+1]), nn.ReLU()]
        self.layers += [nn.Linear(layer_size[-2], layer_size[-1]), nn.Identity()]
    
    def forward(self, obs, act, hist_obs, hist_act, hist_seg_len=None):
        #
        tmp_hist_seg_len = deepcopy(hist_seg_len)
        tmp_hist_seg_len[hist_seg_len == 0] = 1
        
        x = hist_obs
        if hist_seg_len is not None:
            input_packed = pack_padded_sequence(x, lengths=tmp_hist_seg_len,
                                                batch_first=True, enforce_sorted=False)
        else:
            input_packed = pack_padded_sequence(x, lengths=[obs.size(1) for _ in range(obs.size(0))],
                                                batch_first=True, enforce_sorted=False)
        
        lstm_output_packed, (lstm_hidden_state, lstm_cell_state) = self.layers[0](input_packed)
        lstm_output_padded, lstm_output_lengths = pad_packed_sequence(lstm_output_packed, batch_first=True)

        hist_out = torch.gather(lstm_output_padded, 1, (tmp_hist_seg_len-1).view(-1,1).repeat(1, self.lstm_hid_dim).unsqueeze(1).long()).squeeze(1)
        hist_msk = (hist_seg_len != 0).float().view(-1,1).repeat(1, self.lstm_hid_dim).cuda()
        #
        x = torch.cat([hist_out*hist_msk, obs, act], dim=-1)
        for layer in self.layers[1:]:
            x = layer(x)
        return torch.squeeze(x, -1) # Critical to ensure q has right shape.

class MLPActor(nn.Module):
    def __init__(self, obs_dim, act_dim, act_limit, hidden_sizes=(128, 128),
                 lstm_hid_dim=64, lstm_hid_lay_num=2):
        super(MLPActor, self).__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.act_limit = act_limit
        self.lstm_hid_dim = lstm_hid_dim
        
        self.layers = nn.ModuleList()
        #
        self.layers += [nn.LSTM(obs_dim, lstm_hid_dim, lstm_hid_lay_num, batch_first=True)]
        #
        layer_size = [lstm_hid_dim+obs_dim]+list(hidden_sizes) + [act_dim]
        for h in range(len(layer_size)-2):
            self.layers += [nn.Linear(layer_size[h], layer_size[h+1]), nn.ReLU()]
        self.layers += [nn.Linear(layer_size[-2], layer_size[-1]), nn.Tanh()]
    
    def forward(self, obs, hist_obs, hist_act, hist_seg_len=None):
        #
        tmp_hist_seg_len = deepcopy(hist_seg_len)
        tmp_hist_seg_len[hist_seg_len == 0] = 1
        
        x = hist_obs
        if hist_seg_len is not None:
            input_packed = pack_padded_sequence(x, lengths=tmp_hist_seg_len,
                                                batch_first=True, enforce_sorted=False)
        else:
            input_packed = pack_padded_sequence(x, lengths=[obs.size(1) for _ in range(obs.size(0))],
                                                batch_first=True, enforce_sorted=False)
        
        lstm_output_packed, (lstm_hidden_state, lstm_cell_state) = self.layers[0](input_packed)
        lstm_output_padded, lstm_output_lengths = pad_packed_sequence(lstm_output_packed, batch_first=True)
        
        hist_out = torch.gather(lstm_output_padded, 1, (tmp_hist_seg_len-1).view(-1,1).repeat(1, self.lstm_hid_dim).unsqueeze(1).long()).squeeze(1)
        hist_msk = (hist_seg_len != 0).float().view(-1,1).repeat(1, self.lstm_hid_dim).cuda()
        #
        x = torch.cat([hist_out*hist_msk, obs], dim=-1)
        for layer in self.layers[1:]:
            x = layer(x)
        return self.act_limit * x

class MLPActorCritic(nn.Module):
    def __init__(self, obs_dim, act_dim, act_limit, 
                 critic_hidden_sizes=(128, 128), actor_hidden_sizes=(128, 128),
                 lstm_hid_dim=64, lstm_hid_lay_num=1):
        super(MLPActorCritic, self).__init__()
        self.q1 = MLPCritic(obs_dim, act_dim, hidden_sizes=critic_hidden_sizes,
                            lstm_hid_dim=lstm_hid_dim, lstm_hid_lay_num=lstm_hid_lay_num)
        self.q2 = MLPCritic(obs_dim, act_dim, hidden_sizes=critic_hidden_sizes,
                            lstm_hid_dim=lstm_hid_dim, lstm_hid_lay_num=lstm_hid_lay_num)
        self.pi = MLPActor(obs_dim, act_dim, act_limit=1, hidden_sizes=actor_hidden_sizes,
                           lstm_hid_dim=lstm_hid_dim, lstm_hid_lay_num=lstm_hid_lay_num)
    
    def act(self, obs, hist_obs=None, hist_act=None, hist_seg_len=None):
        if (hist_obs is None) or (hist_act is None) or (hist_seg_len is None):
            hist_obs = torch.zeros(1, 1, self.obs_dim).cuda()
            hist_act = torch.zeros(1, 1, self.act_dim).cuda()
            hist_seg_len = torch.zeros(1).cuda()
        with torch.no_grad():
            return self.pi(obs, hist_obs, hist_act, hist_seg_len).cpu().numpy() 
        

In [24]:
ac = MLPActorCritic(12, 10, 1)
# print(ac)
ac.q1.layers[0].requires_grad=False

In [567]:
class LSTMHistoryNet(nn.Module):
    """LSTM of history (o, a) sequences."""
    def __init__(self, obs_dim, act_dim, lstm_hid_dim=128, lstm_hid_lay_num=1):
        super(LSTMHistoryNet, self).__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.lstm_hid_dim = lstm_hid_dim
        self.lstm_hid_lay_num = lstm_hid_lay_num
        
        self.layers = nn.ModuleList()
        self.layers += [nn.LSTM(obs_dim+act_dim, lstm_hid_dim, lstm_hid_lay_num, batch_first=True)]
        
    def forward(self, obs, act, seg_len=None, gather_last=True):
        # tmp_seg_len make sure item where seg_len==0 to be 1
        tmp_seg_len = deepcopy(seg_len)
        tmp_seg_len[seg_len == 0] = 1
        
        x = torch.cat([obs, act], dim=-1)
        if seg_len is not None:
            input_packed = pack_padded_sequence(x, lengths=tmp_seg_len,
                                                batch_first=True, enforce_sorted=False)
        else:
            input_packed = pack_padded_sequence(x, lengths=[obs.size(1) for _ in range(obs.size(0))],
                                                batch_first=True, enforce_sorted=False)
        
        lstm_output_packed, (lstm_hidden_state, lstm_cell_state) = self.layers[0](input_packed)
        lstm_output_padded, lstm_output_lengths = pad_packed_sequence(lstm_output_packed, batch_first=True)
        
        output = lstm_output_padded
        if gather_last:
            hist_out = torch.gather(output, 1, (tmp_seg_len-1).view(-1,1).repeat(1, self.lstm_hid_dim).unsqueeze(1).long()).squeeze(1)
            hist_msk = (seg_len != 0).float().view(-1,1).repeat(1, self.lstm_hid_dim).cuda()
            return hist_out*hist_msk
        else:
            return output
        
class HiddenStateNet(nn.Module):
    def __init__(self, obs_dim, act_dim, hist_lstm_hid_dim=128, hist_lstm_hid_lay_num=1,
                 hidden_sizes=(256, 256,), no_history_memory=False):
        super(HiddenStateNet, self).__init__()
        self.no_history_memory = no_history_memory
        # Hidden State net
        self.hidden_state_layers = nn.ModuleList()
        if self.no_history_memory:
            layer_size = [obs_dim]+list(hidden_sizes)
        else:
            # History net
            self.hist_net = LSTMHistoryNet(obs_dim, act_dim, hist_lstm_hid_dim, hist_lstm_hid_lay_num)
            layer_size = [hist_lstm_hid_dim+obs_dim]+list(hidden_sizes)
        if len(layer_size)>=2:   
            for h in range(len(layer_size)-1):
                self.hidden_state_layers += [nn.Linear(layer_size[h], layer_size[h+1]), nn.ReLU()]
        
    def forward(self, obs, hist_obs, hist_act, hist_seg_len=None):
        if self.no_history_memory:
            state = torch.cat([obs], dim=-1)
        else:
            hist = self.hist_net(hist_obs, hist_act, hist_seg_len, gather_last=True)
            state = torch.cat([obs, hist], dim=-1)   # Concatenate history and current observation
        
        for layer in self.hidden_state_layers:
            state = layer(state)
        return state
        
class MLPCritic(nn.Module):
    def __init__(self, hid_state_dim, hid_state_net, 
                 act_dim, hidden_sizes=(128, 128)):
        super(MLPCritic, self).__init__()
        self.hid_state_dim = hid_state_dim
        self.hid_state_net = hid_state_net
        self.act_dim = act_dim
        
        self.layers = nn.ModuleList()
        layer_size = [hid_state_dim+act_dim]+list(hidden_sizes) + [1]
        for h in range(len(layer_size)-2):
            self.layers += [nn.Linear(layer_size[h], layer_size[h+1]), nn.ReLU()]
        self.layers += [nn.Linear(layer_size[-2], layer_size[-1]), nn.Identity()]
    
    def forward(self, obs, act, hist_obs, hist_act, hist_seg_len=None):
        # get hidden state
        hid_state = self.hid_state_net(obs, hist_obs, hist_act, hist_seg_len)
        # 
        x = torch.cat([hid_state, act], dim=-1)
        for layer in self.layers:
            x = layer(x)
        return torch.squeeze(x, -1) # Critical to ensure q has right shape.

class MLPActor(nn.Module):
    def __init__(self, hid_state_dim, hid_state_net,
                 act_dim, act_limit, hidden_sizes=(128, 128)):
        super(MLPActor, self).__init__()
        self.hid_state_dim = hid_state_dim
        self.hid_state_net = hid_state_net
        self.act_dim = act_dim
        self.act_limit = act_limit
        
        self.layers = nn.ModuleList()
        layer_size = [hid_state_dim]+list(hidden_sizes) + [act_dim]
        for h in range(len(layer_size)-2):
            self.layers += [nn.Linear(layer_size[h], layer_size[h+1]), nn.ReLU()]
        self.layers += [nn.Linear(layer_size[-2], layer_size[-1]), nn.Tanh()]
    
    def forward(self, obs, hist_obs, hist_act, hist_seg_len):
        # get hidden state
        hid_state = self.hid_state_net(obs, hist_obs, hist_act, hist_seg_len)
        x = hid_state
        for layer in self.layers:
            x = layer(x)
        return self.act_limit * x

class MLPActorCritic(nn.Module):
    def __init__(self, obs_dim, act_dim, act_limit=1, 
                 hist_lstm_hid_dim=64, hist_lstm_hid_lay_num=1,
                 hid_state_layer_sizes=(128,), 
                 critic_hidden_sizes=(128,), actor_hidden_sizes=(128,), 
                 share_state_net=True, no_history_memory=False):
        super(MLPActorCritic, self).__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.act_limit = act_limit
        # State Net
        if share_state_net:
            self.critic1_hid_state_net = HiddenStateNet(obs_dim, act_dim, 
                                                       hist_lstm_hid_dim, hist_lstm_hid_lay_num,
                                                       hid_state_layer_sizes,
                                                       no_history_memory=no_history_memory)
            self.critic2_hid_state_net = self.critic1_hid_state_net
            self.actor_hid_state_net = self.critic1_hid_state_net
        else:
            self.critic1_hid_state_net = HiddenStateNet(obs_dim, act_dim, 
                                                        hist_lstm_hid_dim, hist_lstm_hid_lay_num,
                                                        hid_state_layer_sizes,
                                                        no_history_memory=no_history_memory)
            self.critic2_hid_state_net = HiddenStateNet(obs_dim, act_dim, 
                                                        hist_lstm_hid_dim, hist_lstm_hid_lay_num,
                                                        hid_state_layer_sizes,
                                                       no_history_memory=no_history_memory)
            self.actor_hid_state_net = HiddenStateNet(obs_dim, act_dim, 
                                                       hist_lstm_hid_dim, hist_lstm_hid_lay_num,
                                                       hid_state_layer_sizes,
                                                       no_history_memory=no_history_memory)
        # Actor-Critic
        if no_history_memory:
            hist_dim = 0
        else:
            hist_dim = hist_lstm_hid_dim
            
        if len(hid_state_layer_sizes)==0:
            hidden_state_dim = hist_dim+self.obs_dim
        else:
            hidden_state_dim = hid_state_layer_sizes[-1]
        
        self.q1 = MLPCritic(hidden_state_dim, self.critic1_hid_state_net, act_dim, critic_hidden_sizes)
        self.q2 = MLPCritic(hidden_state_dim, self.critic2_hid_state_net, act_dim, critic_hidden_sizes)
        self.pi = MLPActor(hidden_state_dim, self.actor_hid_state_net, act_dim, act_limit, actor_hidden_sizes)
    
    def act(self, obs, hist_obs=None, hist_act=None, hist_seg_len=None):
        if (hist_obs is None) or (hist_act is None) or (hist_seg_len is None):
            hist_obs = torch.zeros(1, 1, self.obs_dim).cuda()
            hist_act = torch.zeros(1, 1, self.act_dim).cuda()
            hist_seg_len = torch.zeros(1).cuda()
        with torch.no_grad():
            return self.pi(obs, hist_obs, hist_act, hist_seg_len).cpu().numpy() 
        

In [465]:
print(hidden_state_net )

HiddenStateNet(
  (hidden_state_layers): ModuleList()
  (hist_net): LSTMHistoryNet(
    (layers): ModuleList(
      (0): LSTM(25, 128, batch_first=True)
    )
  )
)


In [472]:
obs_dim = 15
act_dim = 10
# hist_net = LSTMHistoryNet(obs_dim, act_dim)
hist_lstm_hid_dim=128
hist_lstm_hid_lay_num=1 
hid_state_layer_sizes=()
hist_net = LSTMHistoryNet(obs_dim, act_dim)
hidden_state_net = HiddenStateNet(obs_dim, act_dim, 
                                  hist_lstm_hid_dim, hist_lstm_hid_lay_num,
                                  hid_state_layer_sizes, no_history_memory=False)
q1 = MLPCritic(0, hidden_state_net, act_dim)
# pi = MLPActor(0, hidden_state_net, act_dim, act_limit=1)
# ac = MLPActorCritic(obs_dim, act_dim, hid_state_layer_sizes=(), share_state_net=False)
ac = MLPActorCritic(obs_dim, act_dim, hid_state_layer_sizes=(),
                        critic_hidden_sizes=(128, 128), actor_hidden_sizes=(128,128),
                        share_state_net=False, no_history_memory=True)

In [473]:
print(hist_net)

LSTMHistoryNet(
  (layers): ModuleList(
    (0): LSTM(25, 128, batch_first=True)
  )
)


In [474]:
max_hist_len = 0
if max_hist_len == 0:
    hist_obs = torch.tensor(np.zeros([batch_size, max_hist_len+1, obs_dim])).float()
    hist_act = torch.tensor(np.zeros([batch_size, max_hist_len+1, act_dim])).float()
    hist_len = torch.tensor((max_hist_len) * np.ones(batch_size)).float()
    hist_msk = torch.tensor(np.zeros([batch_size, max_hist_len+1, 128])).float()
hist = hist_net(hist_obs, hist_act, hist_len)

In [475]:
hist_len

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.])

In [478]:
hist

tensor([[-0., 0., -0.,  ..., 0., -0., -0.],
        [-0., 0., -0.,  ..., 0., -0., -0.],
        [-0., 0., -0.,  ..., 0., -0., -0.],
        ...,
        [-0., 0., -0.,  ..., 0., -0., -0.],
        [-0., 0., -0.,  ..., 0., -0., -0.],
        [-0., 0., -0.,  ..., 0., -0., -0.]], grad_fn=<MulBackward0>)

tensor([[[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]]])

In [371]:
# print(hidden_state_net)
# print(q1)
print(ac)

MLPActorCritic(
  (critic_hid_state_net): HiddenStateNet(
    (hist_net): LSTMHistoryNet(
      (layers): ModuleList(
        (0): LSTM(25, 64, batch_first=True)
      )
    )
    (hidden_state_layers): ModuleList()
  )
  (actor_hid_state_net): HiddenStateNet(
    (hist_net): LSTMHistoryNet(
      (layers): ModuleList(
        (0): LSTM(25, 64, batch_first=True)
      )
    )
    (hidden_state_layers): ModuleList()
  )
  (q1): MLPCritic(
    (hid_state_net): HiddenStateNet(
      (hist_net): LSTMHistoryNet(
        (layers): ModuleList(
          (0): LSTM(25, 64, batch_first=True)
        )
      )
      (hidden_state_layers): ModuleList()
    )
    (layers): ModuleList(
      (0): Linear(in_features=25, out_features=128, bias=True)
      (1): ReLU()
      (2): Linear(in_features=128, out_features=128, bias=True)
      (3): ReLU()
      (4): Linear(in_features=128, out_features=1, bias=True)
      (5): Identity()
    )
  )
  (q2): MLPCritic(
    (hid_state_net): HiddenStateNet(
      

In [393]:
import numpy as np
batch_size = 32
max_seg_len = 10

hist_obs = torch.tensor(np.random.randn(batch_size, max_seg_len, obs_dim)).float()
hist_act = torch.tensor(np.random.randn(batch_size, max_seg_len, act_dim)).float()
hist_seg_len = torch.tensor(np.ones(batch_size)*max_seg_len).float()

obs = torch.tensor(np.random.randn(batch_size, obs_dim)).float()
act = torch.tensor(np.random.randn(batch_size, act_dim)).float()
act.shape


torch.Size([32, 10])

In [348]:
hist_seg_len.shape

torch.Size([32])

In [349]:
pi(obs, hist_obs, hist_act, hist_seg_len).shape

torch.Size([32, 10])

In [350]:
hid_state = hidden_state_net(obs, hist_obs, hist_act, hist_seg_len)
hid_state.shape
# q1(obs, act, hist_obs, hist_act, hist_seg_len)

torch.Size([32, 15])

## Replay Buffer

In [4]:
class ReplayBuffer:
    """
    A simple FIFO experience replay buffer for agents.
    """

    def __init__(self, obs_dim, act_dim, max_size):
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.max_size = max_size
        self.obs_buf = np.zeros(core.combined_shape(max_size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros(core.combined_shape(max_size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(core.combined_shape(max_size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(max_size, dtype=np.float32)
        self.done_buf = np.zeros(max_size, dtype=np.float32)
        self.ptr, self.size = 0, 0

    def store(self, obs, act, rew, next_obs, done):
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.obs2_buf[self.ptr] = list(next_obs)
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr+1) % self.max_size
        self.size = min(self.size+1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs_buf[idxs],
                     obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs],
                     rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs])
        return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in batch.items()}
    
    def sample_batch_with_history(self, batch_size=32, max_hist_len=100):
        idxs = np.random.randint(max_hist_len, self.size, size=batch_size)
        # History 
        if max_hist_len == 0:
            hist_obs = np.zeros([batch_size, 1, self.obs_dim])
            hist_act = np.zeros([batch_size, 1, self.act_dim])
            hist_obs2 = np.zeros([batch_size, 1, self.obs_dim])
            hist_act2 = np.zeros([batch_size, 1, self.act_dim])
            hist_rew = np.zeros([batch_size, 1])
            hist_done = np.zeros([batch_size, 1])
            hist_len = np.zeros(batch_size)
#             hist_msk = np.tile((hist_len!=0).astype(float).reshape([-1,1]), [1, 12]).shape
        else:
            hist_obs = np.zeros([batch_size, max_hist_len, self.obs_dim])
            hist_act = np.zeros([batch_size, max_hist_len, self.act_dim])
            hist_obs2 = np.zeros([batch_size, max_hist_len, self.obs_dim])
            hist_act2 = np.zeros([batch_size, max_hist_len, self.act_dim])
            hist_rew = np.zeros([batch_size, max_hist_len])
            hist_done = np.zeros([batch_size, max_hist_len])
            hist_len = max_hist_len * np.ones(batch_size)
            for hist_i in range(max_hist_len):
                hist_obs[:, -1-hist_i, :] = self.obs_buf[idxs-hist_i-1, :]
                hist_act[:, -1-hist_i, :] = self.act_buf[idxs-hist_i-1, :]
                hist_obs2[:, -1-hist_i, :] = self.obs2_buf[idxs-hist_i-1, :]
                hist_act2[:, -1-hist_i, :] = self.act_buf[idxs-hist_i, :]  # include a_t
                hist_rew[:, -1-hist_i] = self.rew_buf[idxs-hist_i-1]
                hist_done[:, -1-hist_i] = self.done_buf[idxs-hist_i-1]
            # If there is done in the backward experiences, only consider the experiences after the last done.
            for batch_i in range(batch_size):
                done_idxs_exclude_last_exp = np.where(hist_done[batch_i][:-1] == 1)  # Exclude last experience
                # If exist done
                if done_idxs_exclude_last_exp[0].size != 0:
                    largest_done_id = done_idxs_exclude_last_exp[0][-1]
                    hist_len[batch_i] = max_hist_len - (largest_done_id+1)

                    # Only keep experiences after the last done
                    obs_keep_part = np.copy(hist_obs[batch_i, largest_done_id+1:, :])
                    act_keep_part = np.copy(hist_act[batch_i, largest_done_id+1:, :])
                    obs2_keep_part = np.copy(hist_obs2[batch_i, largest_done_id+1:, :])
                    act2_keep_part = np.copy(hist_act2[batch_i, largest_done_id+1:, :])
                    rew_keep_part = np.copy(hist_rew[batch_i, largest_done_id+1:])
                    done_keep_part = np.copy(hist_done[batch_i, largest_done_id+1:])

                    # Set to 0 to make sure all experiences are at the beginning
                    hist_obs[batch_i] = np.zeros([max_hist_len, self.obs_dim])
                    hist_act[batch_i] = np.zeros([max_hist_len, self.act_dim])
                    hist_obs2[batch_i] = np.zeros([max_hist_len, self.obs_dim])
                    hist_act2[batch_i] = np.zeros([max_hist_len, self.act_dim])
                    hist_rew[batch_i] = np.zeros([max_hist_len])
                    hist_done[batch_i] = np.zeros([max_hist_len])

                    # Move kept experiences to the start of the segment
                    hist_obs[batch_i, :max_hist_len-(largest_done_id+1), :] = obs_keep_part
                    hist_act[batch_i, :max_hist_len-(largest_done_id+1), :] = act_keep_part
                    hist_obs2[batch_i, :max_hist_len-(largest_done_id+1), :] = obs2_keep_part
                    hist_act2[batch_i, :max_hist_len-(largest_done_id+1), :] = act2_keep_part
                    hist_rew[batch_i, :max_hist_len-(largest_done_id+1)] = rew_keep_part
                    hist_done[batch_i, :max_hist_len-(largest_done_id+1)] = done_keep_part
        # 
        batch = dict(obs=self.obs_buf[idxs],
                     obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs],
                     rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs],
                     hist_obs=hist_obs,
                     hist_act=hist_act,
                     hist_obs2=hist_obs2,
                     hist_act2=hist_act2,
                     hist_rew=hist_rew,
                     hist_done=hist_done,
                     hist_len=hist_len)
        return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in batch.items()}
        

In [177]:
import gym
env = gym.make('HalfCheetah-v2')

obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
act_limit = env.action_space.high[0]

replay_buffer = ReplayBuffer(obs_dim, act_dim, int(1e6))

replay_buffer.size

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


  result = entry_point.load(False)


0

In [178]:
#
total_steps = 10000
o, ep_len = env.reset(), 0

# Main loop: collect experience in env and update/log each epoch
for t in range(total_steps):
    a = env.action_space.sample()
    # Step the env
    o2, r, d, _ = env.step(a)
    ep_len += 1
    
    # Store experience to replay buffer
    replay_buffer.store(o, a, r, o2, d)
    
    o = o2
    
    # End of trajectory handling
    if d or (ep_len == 1000):
        o, ep_len = env.reset(), 0

In [179]:
batch = replay_buffer.sample_batch_with_history(batch_size=32, max_hist_len=100)

In [180]:
batch.keys()

dict_keys(['obs', 'obs2', 'act', 'rew', 'done', 'hist_obs', 'hist_act', 'hist_obs2', 'hist_act2', 'hist_rew', 'hist_done', 'hist_len'])

In [524]:
np.tile((hist_len!=0).astype(float).reshape([-1,1]), [1, 12]).shape

(32, 12)

In [520]:
hist_len = np.zeros(batch_size)
hist_len

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [188]:
data = batch
o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']
h_o, h_a, h_o2, h_a2, h_len = data['hist_obs'], data['hist_act'], data['hist_obs2'], data['hist_act2'], data['hist_len']


In [172]:
ac = MLPActorCritic(obs_dim, act_dim)

In [192]:
ac.q1(o, a, hist_o, hist_a, hist_len)
ac.pi(o2, h_o2, h_a2, h_len)

tensor([[ 0.0316,  0.0567,  0.0405,  0.0322,  0.0781, -0.1170],
        [ 0.0156,  0.0837,  0.0223,  0.0231,  0.0463, -0.1419],
        [ 0.0441,  0.0721,  0.0353,  0.0331,  0.0355, -0.1695],
        [ 0.0449,  0.0727,  0.0434,  0.0263,  0.0561, -0.1729],
        [ 0.0541,  0.0636,  0.0446,  0.0469,  0.0571, -0.1381],
        [ 0.0678,  0.0600,  0.0248,  0.0372,  0.0513, -0.1356],
        [ 0.0365,  0.0449,  0.0362,  0.0280,  0.0255, -0.1182],
        [ 0.0386,  0.0656,  0.0220,  0.0347,  0.0457, -0.1299],
        [ 0.0522,  0.0639,  0.0267,  0.0393,  0.0510, -0.1362],
        [ 0.0605,  0.0488,  0.0121,  0.0442,  0.0574, -0.1404],
        [ 0.0395,  0.0629,  0.0230,  0.0454,  0.0565, -0.1214],
        [ 0.0281,  0.0713,  0.0226,  0.0238,  0.0501, -0.1361],
        [ 0.0245,  0.0499,  0.0284,  0.0284,  0.0404, -0.1200],
        [ 0.0306,  0.0533,  0.0309,  0.0204,  0.0415, -0.1372],
        [ 0.0306,  0.0630,  0.0468,  0.0170,  0.0414, -0.1446],
        [ 0.0514,  0.0629,  0.0293,  0.0

In [175]:
ac.q2(o, a, hist_o, hist_a, hist_len)

tensor([-0.0237, -0.0381, -0.0364, -0.0396, -0.0022, -0.0212, -0.0436,  0.0023,
        -0.0413, -0.0558, -0.0076, -0.0235, -0.0360, -0.0688, -0.0292, -0.0249,
        -0.0106, -0.0418, -0.0171, -0.0601, -0.0390, -0.0242, -0.0379, -0.0314,
        -0.0561, -0.0290, -0.0060, -0.0482, -0.0502, -0.0357, -0.0067, -0.0346],
       grad_fn=<SqueezeBackward1>)

In [261]:
print(ac)

MLPActorCritic(
  (hid_state_net): HiddenStateNet(
    (hist_net): LSTMHistoryNet(
      (layers): ModuleList(
        (0): LSTM(23, 128, batch_first=True)
      )
    )
    (hidden_state_layers): ModuleList(
      (0): Linear(in_features=145, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=256, bias=True)
      (3): ReLU()
    )
  )
  (q1): MLPCritic(
    (hid_state_net): HiddenStateNet(
      (hist_net): LSTMHistoryNet(
        (layers): ModuleList(
          (0): LSTM(23, 128, batch_first=True)
        )
      )
      (hidden_state_layers): ModuleList(
        (0): Linear(in_features=145, out_features=256, bias=True)
        (1): ReLU()
        (2): Linear(in_features=256, out_features=256, bias=True)
        (3): ReLU()
      )
    )
    (layers): ModuleList(
      (0): Linear(in_features=262, out_features=128, bias=True)
      (1): ReLU()
      (2): Linear(in_features=128, out_features=128, bias=True)
      (3): ReLU()
      (4): Line

In [262]:
list(ac.q1.parameters())

[Parameter containing:
 tensor([[ 0.0298, -0.0221, -0.0804,  ..., -0.0422, -0.0664,  0.0520],
         [-0.0132, -0.0251,  0.0341,  ..., -0.0037, -0.0287, -0.0219],
         [-0.0365, -0.0433, -0.0059,  ..., -0.0622, -0.0162, -0.0569],
         ...,
         [ 0.0221,  0.0556, -0.0161,  ...,  0.0451,  0.0104, -0.0262],
         [ 0.0542,  0.0565, -0.0500,  ..., -0.0880, -0.0403,  0.0215],
         [ 0.0169, -0.0637, -0.0798,  ...,  0.0453,  0.0105, -0.0763]],
        requires_grad=True),
 Parameter containing:
 tensor([[-0.0482, -0.0661,  0.0425,  ...,  0.0416,  0.0868,  0.0256],
         [ 0.0882, -0.0609,  0.0280,  ...,  0.0741,  0.0497, -0.0882],
         [ 0.0285,  0.0631, -0.0294,  ...,  0.0066,  0.0079, -0.0730],
         ...,
         [ 0.0261, -0.0515,  0.0115,  ...,  0.0457,  0.0512, -0.0458],
         [ 0.0467, -0.0868,  0.0432,  ...,  0.0654, -0.0873,  0.0500],
         [-0.0082, -0.0319,  0.0020,  ..., -0.0182,  0.0683, -0.0066]],
        requires_grad=True),
 Parameter con

In [253]:
list(ac.pi.parameters())

[Parameter containing:
 tensor([[ 0.0298, -0.0221, -0.0804,  ..., -0.0422, -0.0664,  0.0520],
         [-0.0132, -0.0251,  0.0341,  ..., -0.0037, -0.0287, -0.0219],
         [-0.0365, -0.0433, -0.0059,  ..., -0.0622, -0.0162, -0.0569],
         ...,
         [ 0.0221,  0.0556, -0.0161,  ...,  0.0451,  0.0104, -0.0262],
         [ 0.0542,  0.0565, -0.0500,  ..., -0.0880, -0.0403,  0.0215],
         [ 0.0169, -0.0637, -0.0798,  ...,  0.0453,  0.0105, -0.0763]],
        requires_grad=True),
 Parameter containing:
 tensor([[-0.0482, -0.0661,  0.0425,  ...,  0.0416,  0.0868,  0.0256],
         [ 0.0882, -0.0609,  0.0280,  ...,  0.0741,  0.0497, -0.0882],
         [ 0.0285,  0.0631, -0.0294,  ...,  0.0066,  0.0079, -0.0730],
         ...,
         [ 0.0261, -0.0515,  0.0115,  ...,  0.0457,  0.0512, -0.0458],
         [ 0.0467, -0.0868,  0.0432,  ...,  0.0654, -0.0873,  0.0500],
         [-0.0082, -0.0319,  0.0020,  ..., -0.0182,  0.0683, -0.0066]],
        requires_grad=True),
 Parameter con

In [None]:
## 

In [37]:
def td3(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, 
        steps_per_epoch=2000, epochs=100, replay_size=int(1e6), gamma=0.99, 
        polyak=0.995, pi_lr=1e-3, q_lr=1e-3, 
        batch_size=100, max_hist_len=100,
        start_steps=1000, 
        update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, 
        noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, 
        logger_kwargs=dict(), save_freq=1):
    """
    Twin Delayed Deep Deterministic Policy Gradient (TD3)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of 
            observations as inputs, and ``q1`` and ``q2`` should accept a batch 
            of observations and a batch of actions as inputs. When called, 
            these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current 
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target 
            policy.

        noise_clip (float): Limit for absolute value of target policy 
            smoothing noise.

        policy_delay (int): Policy will only be updated once every 
            policy_delay times for each update of the Q-networks.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
#     ac = MLPActorCritic(obs_dim, act_dim, hid_state_layer_sizes=(),
#                         critic_hidden_sizes=(128, 128), actor_hidden_sizes=(128,128),
#                         share_state_net=True, no_history_memory=False)
    ac = MLPActorCritic(obs_dim, act_dim, act_limit, 
                        critic_hidden_sizes=(128, 128), actor_hidden_sizes=(128, 128),
                        lstm_hid_dim=64, lstm_hid_lay_num=2)
#     ac = MLPActorCriticRandomHist(obs_dim, act_dim, act_limit, 
#                                   critic_hidden_sizes=(128, 128), actor_hidden_sizes=(128, 128))
    ac_targ = deepcopy(ac)
    ac.cuda()
    ac_targ.cuda()
    
    # Freeze hist coding
    ac.q1.layers[0].requires_grad=False
    ac.q2.layers[0].requires_grad=False
    ac.pi.layers[0].requires_grad=False

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False
        
    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, max_size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n'%var_counts)

    # Set up function for computing TD3 Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']
        h_o, h_a, h_o2, h_a2, h_len = data['hist_obs'], data['hist_act'], data['hist_obs2'], data['hist_act2'], data['hist_len']

        q1 = ac.q1(o, a, h_o, h_a, h_len)
        q2 = ac.q2(o, a, h_o, h_a, h_len)
#         q1 = ac.q1(o, a)
#         q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            pi_targ = ac_targ.pi(o2, h_o2, h_a2, h_len)
#             pi_targ = ac_targ.pi(o2)
            
            # Target policy smoothing
            epsilon = torch.randn_like(pi_targ) * target_noise
            epsilon = torch.clamp(epsilon, -noise_clip, noise_clip)
            a2 = pi_targ + epsilon
            a2 = torch.clamp(a2, -act_limit, act_limit)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2, h_o2, h_a2, h_len)
            q2_pi_targ = ac_targ.q2(o2, a2, h_o2, h_a2, h_len)
#             q1_pi_targ = ac_targ.q1(o2, a2)
#             q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        loss_info = dict(Q1Vals=q1.detach().cpu().numpy(),
                         Q2Vals=q2.detach().cpu().numpy())

        return loss_q, loss_info

    # Set up function for computing TD3 pi loss
    def compute_loss_pi(data):
        o, h_o, h_a, h_len = data['obs'], data['hist_obs'], data['hist_act'], data['hist_len']
        q1_pi = ac.q1(o, ac.pi(o, h_o, h_a, h_len), h_o, h_a, h_len)
#         q1_pi = ac.q1(o, ac.pi(o))
        return -q1_pi.mean()

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    q_optimizer = Adam(q_params, lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, timer):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **loss_info)

        # Possibly update pi and target networks
        if timer % policy_delay == 0:

#             # Freeze Q-networks so you don't waste computational effort 
#             # computing gradients for them during the policy learning step.
#             for p in q_params:
#                 p.requires_grad = False

            # Next run one gradient descent step for pi.
            pi_optimizer.zero_grad()
            loss_pi = compute_loss_pi(data)
            loss_pi.backward()
            pi_optimizer.step()

#             # Unfreeze Q-networks so you can optimize it at next DDPG step.
#             for p in q_params:
#                 p.requires_grad = True

            # Record things
            logger.store(LossPi=loss_pi.item())

            # Finally, update target networks by polyak averaging.
            with torch.no_grad():
                for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                    # NB: We use an in-place operations "mul_", "add_" to update target
                    # params, as opposed to "mul" and "add", which would make new tensors.
                    p_targ.data.mul_(polyak)
                    p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, o_buff, a_buff, o_buff_len, noise_scale):
        h_o = torch.tensor(o_buff).view(1, o_buff.shape[0], o_buff.shape[1]).float().cuda()
        h_a = torch.tensor(a_buff).view(1, a_buff.shape[0], a_buff.shape[1]).float().cuda()
        h_l = torch.tensor([o_buff_len]).float().cuda()
        with torch.no_grad(): 
            a = ac.act(torch.as_tensor(o, dtype=torch.float32).view(1,-1).cuda(),
                       h_o, h_a, h_l).reshape(act_dim)
        a += noise_scale * np.random.randn(act_dim)
        if a.shape[0]!=act_dim:
            import pdb
            pdb.set_trace()
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            if max_hist_len>0:
                o_buff = np.zeros([max_hist_len, obs_dim])
                a_buff = np.zeros([max_hist_len, act_dim])
                o_buff[0,:] = o
                o_buff_len = 0
            else:
                o_buff = np.zeros([1, obs_dim])
                a_buff = np.zeros([1, act_dim])
                o_buff_len = 0
                
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                a = get_action(o, o_buff, a_buff, o_buff_len, 0)
                o2, r, d, _ = test_env.step(a)
                ep_ret += r
                ep_len += 1
                # Add short history
                if max_hist_len != 0:
                    if o_buff_len == max_hist_len:
                        o_buff[:max_hist_len-1] = o_buff[1:]
                        a_buff[:max_hist_len-1] = a_buff[1:]
                        o_buff[max_hist_len-1] = list(o)
                        a_buff[max_hist_len-1] = list(a)
                    else:
                        if a.shape[0]!=act_dim:
                            import pdb
                            pdb.set_trace()
                        o_buff[o_buff_len+1-1] = list(o)
                        a_buff[o_buff_len+1-1] = list(a)
                        o_buff_len += 1
                o = o2
                
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0
    if max_hist_len>0:
        o_buff = np.zeros([max_hist_len, obs_dim])
        a_buff = np.zeros([max_hist_len, act_dim])
        o_buff[0,:] = o
        o_buff_len = 0
    else:
        o_buff = np.zeros([1, obs_dim])
        a_buff = np.zeros([1, act_dim])
        o_buff_len = 0

    # Main loop: collect experience in env and update/log each epoch
    start_time = time.time()
    for t in range(total_steps):
        if t%200 == 0:
            end_time = time.time()
            print("t={}, {}s".format(t, end_time-start_time))
            start_time = end_time
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards, 
        # use the learned policy (with some noise, via act_noise). 
        if t > start_steps:
            a = get_action(o, o_buff, a_buff, o_buff_len, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)
        
        # Add short history
        if max_hist_len != 0:
            if o_buff_len == max_hist_len:
                o_buff[:max_hist_len-1] = o_buff[1:]
                a_buff[:max_hist_len-1] = a_buff[1:]
                o_buff[max_hist_len-1] = list(o)
                a_buff[max_hist_len-1] = list(a)
            else:
                if a.shape[0]!=act_dim:
                    import pdb
                    pdb.set_trace()
                o_buff[o_buff_len+1-1] = list(o)
                a_buff[o_buff_len+1-1] = list(a)
                o_buff_len += 1
        
        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2
        
        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0
            if max_hist_len>0:
                o_buff = np.zeros([max_hist_len, obs_dim])
                a_buff = np.zeros([max_hist_len, act_dim])
                o_buff[0,:] = o
                o_buff_len = 0
            else:
                o_buff = np.zeros([1, obs_dim])
                a_buff = np.zeros([1, act_dim])
                o_buff_len = 0
        
        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch_with_history(batch_size, max_hist_len)
                batch = {k: v.cuda() for k,v in batch.items()}
                update(data=batch, timer=j)

        # End of epoch handling
        if (t+1) % steps_per_epoch == 0:
            epoch = (t+1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()

In [38]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 10,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly\lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x000002783E9F6798>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x00000278660D7648>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHistObsOnly",
            "first_row":	true,
        

  result = entry_point.load(False)


t=1000, 0.02695608139038086s
t=1200, 12.775810956954956s
t=1400, 12.7090163230896s
t=1600, 13.022179365158081s
t=1800, 13.023175954818726s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -405 |
|          StdEpRet |             132 |
|          MaxEpRet |            -273 |
|          MinEpRet |            -538 |
|  AverageTestEpRet |            -531 |
|      StdTestEpRet |           0.751 |
|      MaxTestEpRet |            -530 |
|      MinTestEpRet |            -532 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.446 |
|         StdQ1Vals |           0.784 |
|         MaxQ1Vals |            3.53 |
|         MinQ1Vals |           -4.06 |
|     AverageQ2Vals |          -0.446 |
|         StdQ2Vals |           0.786 |
|         MaxQ2Vals |            3.42 |
|         MinQ2Vals |           -3.93 |
|            LossPi |

t=14000, 59.16381239891052s
t=14200, 12.457689762115479s
t=14400, 13.315392971038818s
t=14600, 13.131886005401611s
t=14800, 13.103959798812866s
t=15000, 13.198706150054932s
t=15200, 12.86163592338562s
t=15400, 12.349948644638062s
t=15600, 12.75489330291748s
t=15800, 12.735942840576172s
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |             348 |
|          StdEpRet |             122 |
|          MaxEpRet |             469 |
|          MinEpRet |             226 |
|  AverageTestEpRet |             362 |
|      StdTestEpRet |            53.7 |
|      MaxTestEpRet |             474 |
|      MinTestEpRet |             289 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |           -3.47 |
|         StdQ1Vals |            5.34 |
|         MaxQ1Vals |            28.2 |
|         MinQ1Vals |           -31.1 |
|     AverageQ2Vals |           -

t=28000, 52.07888746261597s
t=28200, 10.806102991104126s
t=28400, 10.989614248275757s
t=28600, 10.894866943359375s
t=28800, 11.648850917816162s
t=29000, 11.16816258430481s
t=29200, 10.995571374893188s
t=29400, 11.035491228103638s
t=29600, 11.672786951065063s
t=29800, 12.486281633377075s
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |             776 |
|          StdEpRet |            80.7 |
|          MaxEpRet |             856 |
|          MinEpRet |             695 |
|  AverageTestEpRet |             680 |
|      StdTestEpRet |            98.2 |
|      MaxTestEpRet |             828 |
|      MinTestEpRet |             467 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |           0.146 |
|         StdQ1Vals |            10.2 |
|         MaxQ1Vals |            30.7 |
|         MinQ1Vals |           -41.4 |
|     AverageQ2Vals |           

t=42000, 52.10367512702942s
t=42200, 10.929772853851318s
t=42400, 11.265878438949585s
t=42600, 11.848315715789795s
t=42800, 11.81843638420105s
t=43000, 11.861285209655762s
t=43200, 13.380218744277954s
t=43400, 11.58206295967102s
t=43600, 11.85694169998169s
t=43800, 11.599980592727661s
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |             805 |
|          StdEpRet |            47.7 |
|          MaxEpRet |             853 |
|          MinEpRet |             757 |
|  AverageTestEpRet |             628 |
|      StdTestEpRet |             398 |
|      MaxTestEpRet |             856 |
|      MinTestEpRet |            -556 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.4e+04 |
|     AverageQ1Vals |            3.78 |
|         StdQ1Vals |            15.8 |
|         MaxQ1Vals |            32.8 |
|         MinQ1Vals |           -47.8 |
|     AverageQ2Vals |            3

t=56000, 54.49528694152832s
t=56200, 12.742383480072021s
t=56400, 12.382852554321289s
t=56600, 11.132201194763184s
t=56800, 11.410489082336426s
t=57000, 11.862281322479248s
t=57200, 13.610634088516235s
t=57400, 11.658794164657593s
t=57600, 11.744594812393188s
t=57800, 12.402836322784424s
---------------------------------------
|             Epoch |              29 |
|      AverageEpRet |             197 |
|          StdEpRet |             733 |
|          MaxEpRet |             929 |
|          MinEpRet |            -536 |
|  AverageTestEpRet |             734 |
|      StdTestEpRet |             104 |
|      MaxTestEpRet |             890 |
|      MinTestEpRet |             614 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.8e+04 |
|     AverageQ1Vals |            7.15 |
|         StdQ1Vals |            19.6 |
|         MaxQ1Vals |            37.9 |
|         MinQ1Vals |           -66.6 |
|     AverageQ2Vals |          

t=70000, 53.23673129081726s
t=70200, 11.942078590393066s
t=70400, 12.017864227294922s
t=70600, 12.298115015029907s
t=70800, 11.30078125s
t=71000, 12.37731409072876s
t=71200, 12.697047710418701s
t=71400, 12.331026792526245s
t=71600, 12.901501417160034s
t=71800, 12.196386814117432s
---------------------------------------
|             Epoch |              36 |
|      AverageEpRet |             829 |
|          StdEpRet |            57.3 |
|          MaxEpRet |             886 |
|          MinEpRet |             772 |
|  AverageTestEpRet |             817 |
|      StdTestEpRet |            82.9 |
|      MaxTestEpRet |             977 |
|      MinTestEpRet |             712 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         7.2e+04 |
|     AverageQ1Vals |            8.94 |
|         StdQ1Vals |            24.5 |
|         MaxQ1Vals |            42.2 |
|         MinQ1Vals |           -78.3 |
|     AverageQ2Vals |            8.94 |

t=84000, 54.2240309715271s
t=84200, 12.494561910629272s
t=84400, 12.492594480514526s
t=84600, 11.79246711730957s
t=84800, 11.4992516040802s
t=85000, 11.411484956741333s
t=85200, 11.332729816436768s
t=85400, 11.556093692779541s
t=85600, 11.988913297653198s
t=85800, 11.779501914978027s
---------------------------------------
|             Epoch |              43 |
|      AverageEpRet |             750 |
|          StdEpRet |            56.4 |
|          MaxEpRet |             807 |
|          MinEpRet |             694 |
|  AverageTestEpRet |             710 |
|      StdTestEpRet |              69 |
|      MaxTestEpRet |             805 |
|      MinTestEpRet |             588 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         8.6e+04 |
|     AverageQ1Vals |            13.1 |
|         StdQ1Vals |            26.4 |
|         MaxQ1Vals |            45.1 |
|         MinQ1Vals |           -80.8 |
|     AverageQ2Vals |            13

t=98000, 52.45074129104614s
t=98200, 11.241917610168457s
t=98400, 11.368600130081177s
t=98600, 11.149187803268433s
t=98800, 11.047459125518799s
t=99000, 10.960690975189209s
t=99200, 11.004573583602905s
t=99400, 11.088349342346191s
t=99600, 11.277848720550537s
t=99800, 11.158162593841553s
---------------------------------------
|             Epoch |              50 |
|      AverageEpRet |             672 |
|          StdEpRet |            2.73 |
|          MaxEpRet |             674 |
|          MinEpRet |             669 |
|  AverageTestEpRet |             509 |
|      StdTestEpRet |             325 |
|      MaxTestEpRet |             724 |
|      MinTestEpRet |            -441 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+05 |
|     AverageQ1Vals |            16.8 |
|         StdQ1Vals |            25.9 |
|         MaxQ1Vals |            48.8 |
|         MinQ1Vals |             -77 |
|     AverageQ2Vals |          

In [35]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 10,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHist\lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x000002783E9EAF78>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHist",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000002783BA52FC8>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHist",
            "first_row":	true,
            "log_current_row":	{},
 

  result = entry_point.load(False)


t=1200, 11.682729005813599s
t=1400, 12.542140007019043s
t=1600, 12.922446012496948s
t=1800, 10.963683366775513s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -354 |
|          StdEpRet |             196 |
|          MaxEpRet |            -158 |
|          MinEpRet |            -550 |
|  AverageTestEpRet |            -595 |
|      StdTestEpRet |           0.602 |
|      MaxTestEpRet |            -595 |
|      MinTestEpRet |            -596 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.289 |
|         StdQ1Vals |           0.765 |
|         MaxQ1Vals |            3.66 |
|         MinQ1Vals |           -2.82 |
|     AverageQ2Vals |          -0.289 |
|         StdQ2Vals |           0.764 |
|         MaxQ2Vals |            3.49 |
|         MinQ2Vals |           -2.86 |
|            LossPi |          -0.103 |
|       

t=14000, 56.19373965263367s
t=14200, 11.512214183807373s
t=14400, 11.41148567199707s
t=14600, 11.277842998504639s
t=14800, 11.391539096832275s
t=15000, 11.542165279388428s
t=15200, 11.3197021484375s
t=15400, 11.674781560897827s
t=15600, 11.303773641586304s
t=15800, 11.395528554916382s
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |             366 |
|          StdEpRet |             111 |
|          MaxEpRet |             478 |
|          MinEpRet |             255 |
|  AverageTestEpRet |             535 |
|      StdTestEpRet |            91.4 |
|      MaxTestEpRet |             690 |
|      MinTestEpRet |             351 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |           -1.27 |
|         StdQ1Vals |            5.42 |
|         MaxQ1Vals |            23.6 |
|         MinQ1Vals |           -29.4 |
|     AverageQ2Vals |           -1

t=28000, 56.40356397628784s
t=28200, 11.985308408737183s
t=28400, 11.593000411987305s
t=28600, 11.78252124786377s
t=28800, 11.405505180358887s
t=29000, 12.212313413619995s
t=29200, 11.855298042297363s
t=29400, 12.256226778030396s
t=29600, 12.798776388168335s
t=29800, 11.645859003067017s
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |             587 |
|          StdEpRet |            62.5 |
|          MaxEpRet |             650 |
|          MinEpRet |             524 |
|  AverageTestEpRet |             665 |
|      StdTestEpRet |            99.1 |
|      MaxTestEpRet |             794 |
|      MinTestEpRet |             437 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |            4.17 |
|         StdQ1Vals |            13.7 |
|         MaxQ1Vals |            29.1 |
|         MinQ1Vals |           -40.9 |
|     AverageQ2Vals |           

t=42000, 57.91757297515869s
t=42200, 12.005929946899414s
t=42400, 12.097650051116943s
t=42600, 11.605932712554932s
t=42800, 11.788511991500854s
t=43000, 11.716636896133423s
t=43200, 12.284182786941528s
t=43400, 12.059720754623413s
t=43600, 12.284180164337158s
t=43800, 11.956998109817505s
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |             711 |
|          StdEpRet |            69.7 |
|          MaxEpRet |             781 |
|          MinEpRet |             641 |
|  AverageTestEpRet |             497 |
|      StdTestEpRet |             305 |
|      MaxTestEpRet |             717 |
|      MinTestEpRet |            -343 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.4e+04 |
|     AverageQ1Vals |            10.7 |
|         StdQ1Vals |            18.8 |
|         MaxQ1Vals |            37.4 |
|         MinQ1Vals |           -44.5 |
|     AverageQ2Vals |          

t=56000, 59.16693735122681s
t=56200, 12.672956228256226s
t=56400, 12.203838109970093s
t=56600, 13.701124906539917s
t=56800, 12.8182692527771s
t=57000, 13.214416027069092s
t=57200, 13.481974363327026s
t=57400, 14.06886076927185s
t=57600, 13.909690618515015s
t=57800, 12.700054168701172s
---------------------------------------
|             Epoch |              29 |
|      AverageEpRet |             811 |
|          StdEpRet |              44 |
|          MaxEpRet |             855 |
|          MinEpRet |             767 |
|  AverageTestEpRet |             715 |
|      StdTestEpRet |            47.1 |
|      MaxTestEpRet |             808 |
|      MinTestEpRet |             659 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.8e+04 |
|     AverageQ1Vals |            15.2 |
|         StdQ1Vals |              23 |
|         MaxQ1Vals |            42.9 |
|         MinQ1Vals |           -52.5 |
|     AverageQ2Vals |            1

t=70000, 50.36731839179993s
t=70200, 10.426120281219482s
t=70400, 10.366309404373169s
t=70600, 10.568742513656616s
t=70800, 10.51688003540039s
t=71000, 10.393203735351562s
t=71200, 10.548795223236084s
t=71400, 10.511893510818481s
t=71600, 10.841010808944702s
t=71800, 10.546796798706055s
---------------------------------------
|             Epoch |              36 |
|      AverageEpRet |             739 |
|          StdEpRet |             107 |
|          MaxEpRet |             846 |
|          MinEpRet |             632 |
|  AverageTestEpRet |             665 |
|      StdTestEpRet |             192 |
|      MaxTestEpRet |             888 |
|      MinTestEpRet |             263 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         7.2e+04 |
|     AverageQ1Vals |            19.4 |
|         StdQ1Vals |            25.4 |
|         MaxQ1Vals |            47.6 |
|         MinQ1Vals |           -53.1 |
|     AverageQ2Vals |           

t=84000, 66.19431900978088s
t=84200, 17.58868169784546s
t=84400, 15.407532930374146s
t=84600, 18.131980419158936s
t=84800, 16.44064950942993s
t=85000, 15.734139680862427s
t=85200, 15.274560689926147s
t=85400, 14.411462783813477s
t=85600, 14.583005428314209s
t=85800, 14.365586280822754s
---------------------------------------
|             Epoch |              43 |
|      AverageEpRet |             767 |
|          StdEpRet |            38.2 |
|          MaxEpRet |             805 |
|          MinEpRet |             728 |
|  AverageTestEpRet |             664 |
|      StdTestEpRet |             183 |
|      MaxTestEpRet |             944 |
|      MinTestEpRet |             326 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         8.6e+04 |
|     AverageQ1Vals |            21.4 |
|         StdQ1Vals |            27.2 |
|         MaxQ1Vals |            49.1 |
|         MinQ1Vals |             -55 |
|     AverageQ2Vals |            

KeyboardInterrupt: 

In [33]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 10,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHist_FreezeHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHist_FreezeHist\lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHist_FreezeHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x00000278025FAD38>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHist_FreezeHist",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000002783BB55D08>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory10Len_LSTM2L64_HidStateLayer0_2L128_NoFreeze_MLPHist_FreezeHist",
            "first_row"

  result = entry_point.load(False)


t=1200, 11.851855993270874s
t=1400, 11.542742252349854s
t=1600, 11.689244508743286s
t=1800, 10.692408323287964s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -399 |
|          StdEpRet |             154 |
|          MaxEpRet |            -245 |
|          MinEpRet |            -554 |
|  AverageTestEpRet |            -598 |
|      StdTestEpRet |           0.473 |
|      MaxTestEpRet |            -597 |
|      MinTestEpRet |            -598 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.329 |
|         StdQ1Vals |           0.796 |
|         MaxQ1Vals |            4.05 |
|         MinQ1Vals |           -3.22 |
|     AverageQ2Vals |           -0.33 |
|         StdQ2Vals |           0.794 |
|         MaxQ2Vals |            3.96 |
|         MinQ2Vals |           -3.24 |
|            LossPi |          -0.136 |
|       

t=14000, 55.49360990524292s
t=14200, 11.461353778839111s
t=14400, 11.5989830493927s
t=14600, 11.50922417640686s
t=14800, 11.596989393234253s
t=15000, 11.479304075241089s
t=15200, 11.648853063583374s
t=15400, 11.522187948226929s
t=15600, 11.463346481323242s
t=15800, 12.864975452423096s
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |            -546 |
|          StdEpRet |            5.33 |
|          MaxEpRet |            -541 |
|          MinEpRet |            -551 |
|  AverageTestEpRet |            -584 |
|      StdTestEpRet |            23.5 |
|      MaxTestEpRet |            -519 |
|      MinTestEpRet |            -599 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |             -14 |
|         StdQ1Vals |            4.16 |
|         MaxQ1Vals |            27.8 |
|         MinQ1Vals |           -30.6 |
|     AverageQ2Vals |             

t=28000, 55.19241642951965s
t=28200, 11.151180505752563s
t=28400, 11.096363067626953s
t=28600, 11.150150060653687s
t=28800, 11.264877557754517s
t=29000, 11.407496452331543s
t=29200, 11.28382658958435s
t=29400, 11.75257396697998s
t=29600, 12.10964846611023s
t=29800, 11.321696281433105s
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |            -335 |
|          StdEpRet |            42.1 |
|          MaxEpRet |            -293 |
|          MinEpRet |            -377 |
|  AverageTestEpRet |            -176 |
|      StdTestEpRet |            60.9 |
|      MaxTestEpRet |           -89.8 |
|      MinTestEpRet |            -277 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |           -19.4 |
|         StdQ1Vals |            4.16 |
|         MaxQ1Vals |            18.5 |
|         MinQ1Vals |           -39.8 |
|     AverageQ2Vals |           -1

t=42000, 65.6437451839447s
t=42200, 11.567036628723145s
t=42400, 11.324726343154907s
t=42600, 12.028832912445068s
t=42800, 11.299785852432251s
t=43000, 11.57705307006836s
t=43200, 11.58800220489502s
t=43400, 11.6398766040802s
t=43600, 11.540140151977539s
t=43800, 11.639875173568726s
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |             846 |
|          StdEpRet |            51.7 |
|          MaxEpRet |             897 |
|          MinEpRet |             794 |
|  AverageTestEpRet |             988 |
|      StdTestEpRet |            53.7 |
|      MaxTestEpRet |        1.08e+03 |
|      MinTestEpRet |             917 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.4e+04 |
|     AverageQ1Vals |           -12.2 |
|         StdQ1Vals |            8.44 |
|         MaxQ1Vals |            27.1 |
|         MinQ1Vals |             -40 |
|     AverageQ2Vals |           -12.

t=56000, 54.6658501625061s
t=56200, 11.481272220611572s
t=56400, 11.486285924911499s
t=56600, 11.583027362823486s
t=56800, 11.531165361404419s
t=57000, 12.779826402664185s
t=57200, 11.362616539001465s
t=57400, 11.47431755065918s
t=57600, 11.640872240066528s
t=57800, 12.171454191207886s
---------------------------------------
|             Epoch |              29 |
|      AverageEpRet |        2.28e+03 |
|          StdEpRet |             2.6 |
|          MaxEpRet |        2.28e+03 |
|          MinEpRet |        2.28e+03 |
|  AverageTestEpRet |        2.31e+03 |
|      StdTestEpRet |            78.6 |
|      MaxTestEpRet |        2.46e+03 |
|      MinTestEpRet |         2.2e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.8e+04 |
|     AverageQ1Vals |            4.03 |
|         StdQ1Vals |            21.8 |
|         MaxQ1Vals |            68.6 |
|         MinQ1Vals |             -34 |
|     AverageQ2Vals |            

t=70000, 56.62005066871643s
t=70200, 11.748608350753784s
t=70400, 11.577014684677124s
t=70600, 12.027531862258911s
t=70800, 11.911150455474854s
t=71000, 13.156818389892578s
t=71200, 12.787806272506714s
t=71400, 12.56240701675415s
t=71600, 12.818726539611816s
t=71800, 12.582350730895996s
---------------------------------------
|             Epoch |              36 |
|      AverageEpRet |        2.87e+03 |
|          StdEpRet |            69.1 |
|          MaxEpRet |        2.94e+03 |
|          MinEpRet |         2.8e+03 |
|  AverageTestEpRet |        2.98e+03 |
|      StdTestEpRet |            57.3 |
|      MaxTestEpRet |         3.1e+03 |
|      MinTestEpRet |        2.91e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         7.2e+04 |
|     AverageQ1Vals |              39 |
|         StdQ1Vals |            41.7 |
|         MaxQ1Vals |             124 |
|         MinQ1Vals |           -29.8 |
|     AverageQ2Vals |           

t=84000, 55.381908893585205s
t=84200, 11.529170513153076s
t=84400, 12.134554624557495s
t=84600, 12.545452356338501s
t=84800, 12.378896474838257s
t=85000, 12.825705528259277s
t=85200, 12.476637125015259s
t=85400, 12.197383642196655s
t=85600, 12.712015151977539s
t=85800, 12.637201309204102s
---------------------------------------
|             Epoch |              43 |
|      AverageEpRet |        3.04e+03 |
|          StdEpRet |            21.2 |
|          MaxEpRet |        3.06e+03 |
|          MinEpRet |        3.02e+03 |
|  AverageTestEpRet |        3.23e+03 |
|      StdTestEpRet |              38 |
|      MaxTestEpRet |         3.3e+03 |
|      MinTestEpRet |        3.17e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         8.6e+04 |
|     AverageQ1Vals |            73.7 |
|         StdQ1Vals |              53 |
|         MaxQ1Vals |             160 |
|         MinQ1Vals |           -16.3 |
|     AverageQ2Vals |         

t=98000, 55.90151882171631s
t=98200, 11.577043771743774s
t=98400, 11.75157642364502s
t=98600, 11.572083950042725s
t=98800, 12.128568410873413s
t=99000, 13.302402973175049s
t=99200, 14.330678939819336s
t=99400, 12.273180723190308s
t=99600, 12.126943111419678s
t=99800, 12.036780595779419s
---------------------------------------
|             Epoch |              50 |
|      AverageEpRet |        3.21e+03 |
|          StdEpRet |            30.5 |
|          MaxEpRet |        3.24e+03 |
|          MinEpRet |        3.18e+03 |
|  AverageTestEpRet |        3.39e+03 |
|      StdTestEpRet |            33.7 |
|      MaxTestEpRet |        3.46e+03 |
|      MinTestEpRet |        3.35e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+05 |
|     AverageQ1Vals |             104 |
|         StdQ1Vals |            59.8 |
|         MaxQ1Vals |             186 |
|         MinQ1Vals |          -0.658 |
|     AverageQ2Vals |           

In [26]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_Share_HistMemory5Len_LSTM1L64_HidStateLayer0_2L128_NoFreeze_MLPHist_FreezeHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Share_HistMemory5Len_LSTM1L64_HidStateLayer0_2L128_NoFreeze_MLPHist_FreezeHist\lstm_td3_HalfCheetah_Share_HistMemory5Len_LSTM1L64_HidStateLayer0_2L128_NoFreeze_MLPHist_FreezeHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x000002783E9535E8>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory5Len_LSTM1L64_HidStateLayer0_2L128_NoFreeze_MLPHist_FreezeHist",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x00000278025F3BC8>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory5Len_LSTM1L64_HidStateLayer0_2L128_NoFreeze_MLPHist_FreezeHist",
            "first_row":	tr

  result = entry_point.load(False)


t=1200, 9.456433534622192s
t=1400, 9.309107542037964s
t=1600, 8.917154788970947s
t=1800, 9.555449485778809s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -366 |
|          StdEpRet |             143 |
|          MaxEpRet |            -223 |
|          MinEpRet |            -509 |
|  AverageTestEpRet |            -570 |
|      StdTestEpRet |           0.843 |
|      MaxTestEpRet |            -569 |
|      MinTestEpRet |            -571 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.455 |
|         StdQ1Vals |            0.66 |
|         MaxQ1Vals |            2.97 |
|         MinQ1Vals |           -4.13 |
|     AverageQ2Vals |          -0.455 |
|         StdQ2Vals |           0.663 |
|         MaxQ2Vals |            2.93 |
|         MinQ2Vals |            -3.8 |
|            LossPi |          0.0461 |
|           

t=14000, 61.7359619140625s
t=14200, 12.59496784210205s
t=14400, 11.76783537864685s
t=14600, 12.4988694190979s
t=14800, 11.478076457977295s
t=15000, 12.032298803329468s
t=15200, 11.671780347824097s
t=15400, 12.05522346496582s
t=15600, 13.032956600189209s
t=15800, 14.749070405960083s
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |             179 |
|          StdEpRet |            8.11 |
|          MaxEpRet |             187 |
|          MinEpRet |             171 |
|  AverageTestEpRet |             268 |
|      StdTestEpRet |            62.5 |
|      MaxTestEpRet |             379 |
|      MinTestEpRet |             188 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |           -4.85 |
|         StdQ1Vals |            3.38 |
|         MaxQ1Vals |            18.7 |
|         MinQ1Vals |           -24.1 |
|     AverageQ2Vals |           -4.85

t=28000, 55.28334045410156s
t=28200, 11.241946935653687s
t=28400, 11.196078538894653s
t=28600, 10.746819257736206s
t=28800, 10.548462390899658s
t=29000, 10.676066875457764s
t=29200, 10.590195894241333s
t=29400, 10.573553800582886s
t=29600, 10.79834794998169s
t=29800, 10.761019229888916s
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |             869 |
|          StdEpRet |            37.3 |
|          MaxEpRet |             907 |
|          MinEpRet |             832 |
|  AverageTestEpRet |             874 |
|      StdTestEpRet |            88.7 |
|      MaxTestEpRet |             941 |
|      MinTestEpRet |             631 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |            1.14 |
|         StdQ1Vals |            7.13 |
|         MaxQ1Vals |            25.8 |
|         MinQ1Vals |           -29.9 |
|     AverageQ2Vals |           

t=42000, 59.17961263656616s
t=42200, 10.77469778060913s
t=42400, 10.88777232170105s
t=42600, 12.122169256210327s
t=42800, 11.579976797103882s
t=43000, 10.775771617889404s
t=43200, 11.041840553283691s
t=43400, 10.6863534450531s
t=43600, 10.770029306411743s
t=43800, 11.226755142211914s
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |             949 |
|          StdEpRet |            12.4 |
|          MaxEpRet |             961 |
|          MinEpRet |             937 |
|  AverageTestEpRet |             689 |
|      StdTestEpRet |             440 |
|      MaxTestEpRet |             966 |
|      MinTestEpRet |            -356 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.4e+04 |
|     AverageQ1Vals |            15.6 |
|         StdQ1Vals |            12.8 |
|         MaxQ1Vals |            40.8 |
|         MinQ1Vals |             -29 |
|     AverageQ2Vals |            15

t=56000, 57.01057410240173s
t=56200, 10.893005609512329s
t=56400, 11.681405544281006s
t=56600, 11.837080478668213s
t=56800, 11.3784499168396s
t=57000, 11.393157482147217s
t=57200, 11.287713527679443s
t=57400, 11.811911582946777s
t=57600, 12.293723583221436s
t=57800, 11.748120546340942s
---------------------------------------
|             Epoch |              29 |
|      AverageEpRet |        1.05e+03 |
|          StdEpRet |            3.69 |
|          MaxEpRet |        1.05e+03 |
|          MinEpRet |        1.04e+03 |
|  AverageTestEpRet |        1.01e+03 |
|      StdTestEpRet |            81.2 |
|      MaxTestEpRet |        1.07e+03 |
|      MinTestEpRet |             778 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.8e+04 |
|     AverageQ1Vals |            27.3 |
|         StdQ1Vals |            15.8 |
|         MaxQ1Vals |            48.4 |
|         MinQ1Vals |           -32.3 |
|     AverageQ2Vals |            

KeyboardInterrupt: 

In [17]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_Share_HistMemory5Len_LSTM1L64_HidStateLayer0_2L128_NoFreeze_MLPHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Share_HistMemory5Len_LSTM1L64_HidStateLayer0_2L128_NoFreeze_MLPHist\lstm_td3_HalfCheetah_Share_HistMemory5Len_LSTM1L64_HidStateLayer0_2L128_NoFreeze_MLPHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x000002783DB22F78>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory5Len_LSTM1L64_HidStateLayer0_2L128_NoFreeze_MLPHist",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000002783E9FF688>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory5Len_LSTM1L64_HidStateLayer0_2L128_NoFreeze_MLPHist",
            "first_row":	true,
            "log_current_row":	{},
     

  result = entry_point.load(False)


t=1200, 11.046433925628662s
t=1400, 10.44207763671875s
t=1600, 12.804193019866943s
t=1800, 10.657530546188354s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -405 |
|          StdEpRet |             134 |
|          MaxEpRet |            -272 |
|          MinEpRet |            -539 |
|  AverageTestEpRet |            -600 |
|      StdTestEpRet |            0.96 |
|      MaxTestEpRet |            -598 |
|      MinTestEpRet |            -601 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.593 |
|         StdQ1Vals |           0.748 |
|         MaxQ1Vals |            4.43 |
|         MinQ1Vals |           -5.13 |
|     AverageQ2Vals |          -0.593 |
|         StdQ2Vals |           0.749 |
|         MaxQ2Vals |             4.4 |
|         MinQ2Vals |              -5 |
|            LossPi |           0.472 |
|        

t=14000, 66.23985052108765s
t=14200, 10.965705156326294s
t=14400, 11.658431768417358s
t=14600, 10.964711904525757s
t=14800, 11.178106784820557s
t=15000, 10.96567153930664s
t=15200, 12.083665609359741s
t=15400, 10.482978582382202s
t=15600, 12.528823375701904s
t=15800, 12.264278411865234s
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |            -463 |
|          StdEpRet |            77.5 |
|          MaxEpRet |            -386 |
|          MinEpRet |            -541 |
|  AverageTestEpRet |            -593 |
|      StdTestEpRet |            62.2 |
|      MaxTestEpRet |            -511 |
|      MinTestEpRet |            -711 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |           -6.17 |
|         StdQ1Vals |               4 |
|         MaxQ1Vals |            15.1 |
|         MinQ1Vals |           -23.2 |
|     AverageQ2Vals |           

t=28000, 52.625245094299316s
t=28200, 10.139886379241943s
t=28400, 9.916482210159302s
t=28600, 10.11595106124878s
t=28800, 10.113954305648804s
t=29000, 9.750954151153564s
t=29200, 9.949395895004272s
t=29400, 10.032145023345947s
t=29600, 10.090019464492798s
t=29800, 9.968344926834106s
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |             427 |
|          StdEpRet |            8.41 |
|          MaxEpRet |             436 |
|          MinEpRet |             419 |
|  AverageTestEpRet |             505 |
|      StdTestEpRet |            41.9 |
|      MaxTestEpRet |             567 |
|      MinTestEpRet |             427 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |           -9.54 |
|         StdQ1Vals |            6.17 |
|         MaxQ1Vals |            19.3 |
|         MinQ1Vals |           -33.1 |
|     AverageQ2Vals |           -9.

t=42000, 52.58239436149597s
t=42200, 9.94141674041748s
t=42400, 9.958400249481201s
t=42600, 10.064059734344482s
t=42800, 10.056138515472412s
t=43000, 9.996241807937622s
t=43200, 9.95936918258667s
t=43400, 10.056108713150024s
t=43600, 10.321433305740356s
t=43800, 9.834711790084839s
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |             610 |
|          StdEpRet |            45.1 |
|          MaxEpRet |             656 |
|          MinEpRet |             565 |
|  AverageTestEpRet |             702 |
|      StdTestEpRet |            37.5 |
|      MaxTestEpRet |             769 |
|      MinTestEpRet |             634 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.4e+04 |
|     AverageQ1Vals |           -5.07 |
|         StdQ1Vals |            7.11 |
|         MaxQ1Vals |            18.4 |
|         MinQ1Vals |             -38 |
|     AverageQ2Vals |           -5.07 

KeyboardInterrupt: 

In [16]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 15,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_Share_HistMemory15Len_LSTM1L64_HidStateLayer0_2L128_NoFreeze_MLPHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Share_HistMemory15Len_LSTM1L64_HidStateLayer0_2L128_NoFreeze_MLPHist\lstm_td3_HalfCheetah_Share_HistMemory15Len_LSTM1L64_HidStateLayer0_2L128_NoFreeze_MLPHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000027878B18708>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory15Len_LSTM1L64_HidStateLayer0_2L128_NoFreeze_MLPHist",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000027802864088>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory15Len_LSTM1L64_HidStateLayer0_2L128_NoFreeze_MLPHist",
            "first_row":	true,
            "log_current_row":	{},
 

  result = entry_point.load(False)


t=1200, 10.065124273300171s
t=1400, 10.207666635513306s
t=1600, 9.831748485565186s
t=1800, 9.868572235107422s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -387 |
|          StdEpRet |             133 |
|          MaxEpRet |            -254 |
|          MinEpRet |            -520 |
|  AverageTestEpRet |             180 |
|      StdTestEpRet |            66.7 |
|      MaxTestEpRet |             252 |
|      MinTestEpRet |            32.1 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.358 |
|         StdQ1Vals |           0.815 |
|         MaxQ1Vals |            3.45 |
|         MinQ1Vals |           -3.45 |
|     AverageQ2Vals |          -0.358 |
|         StdQ2Vals |           0.816 |
|         MaxQ2Vals |            3.39 |
|         MinQ2Vals |           -3.44 |
|            LossPi |           -0.19 |
|         

t=14000, 53.15187168121338s
t=14200, 10.012226581573486s
t=14400, 10.159860849380493s
t=14600, 10.795105218887329s
t=14800, 10.832064390182495s
t=15000, 10.69437551498413s
t=15200, 10.67645263671875s
t=15400, 10.658496141433716s
t=15600, 10.667475700378418s
t=15800, 10.3942391872406s
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |            -351 |
|          StdEpRet |             112 |
|          MaxEpRet |            -239 |
|          MinEpRet |            -463 |
|  AverageTestEpRet |            -226 |
|      StdTestEpRet |            74.7 |
|      MaxTestEpRet |           -29.7 |
|      MinTestEpRet |            -310 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |           -6.96 |
|         StdQ1Vals |            5.45 |
|         MaxQ1Vals |            21.3 |
|         MinQ1Vals |           -41.8 |
|     AverageQ2Vals |           -6.

t=28000, 51.57309365272522s
t=28200, 9.869637727737427s
t=28400, 9.98726511001587s
t=28600, 10.011229753494263s
t=28800, 10.075086832046509s
t=29000, 10.116919994354248s
t=29200, 10.16980528831482s
t=29400, 10.055135250091553s
t=29600, 10.178759098052979s
t=29800, 11.240942239761353s
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |             117 |
|          StdEpRet |            10.7 |
|          MaxEpRet |             128 |
|          MinEpRet |             107 |
|  AverageTestEpRet |              74 |
|      StdTestEpRet |             191 |
|      MaxTestEpRet |             171 |
|      MinTestEpRet |            -495 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |           -12.2 |
|         StdQ1Vals |            5.62 |
|         MaxQ1Vals |            17.5 |
|         MinQ1Vals |           -45.7 |
|     AverageQ2Vals |           -12

KeyboardInterrupt: 

In [12]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 6,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_Share_HistMemory8Len_HidStateLayer0_2L128_NoFreeze_MLPHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Share_HistMemory8Len_HidStateLayer0_2L128_NoFreeze_MLPHist\lstm_td3_HalfCheetah_Share_HistMemory8Len_HidStateLayer0_2L128_NoFreeze_MLPHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x000002783BAA1F78>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory8Len_HidStateLayer0_2L128_NoFreeze_MLPHist",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000002783BCEC048>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory8Len_HidStateLayer0_2L128_NoFreeze_MLPHist",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
          

  result = entry_point.load(False)


t=800, 0.03989386558532715s
t=1000, 0.03690028190612793s
t=1200, 11.257975578308105s
t=1400, 9.977291822433472s
t=1600, 10.078081607818604s
t=1800, 11.046891212463379s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -440 |
|          StdEpRet |             111 |
|          MaxEpRet |            -329 |
|          MinEpRet |            -551 |
|  AverageTestEpRet |            -595 |
|      StdTestEpRet |           0.833 |
|      MaxTestEpRet |            -594 |
|      MinTestEpRet |            -597 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.489 |
|         StdQ1Vals |           0.888 |
|         MaxQ1Vals |            4.42 |
|         MinQ1Vals |           -4.06 |
|     AverageQ2Vals |          -0.489 |
|         StdQ2Vals |           0.887 |
|         MaxQ2Vals |            4.45 |
|         MinQ2Vals |           

t=14000, 50.506948947906494s
t=14200, 10.026189088821411s
t=14400, 10.089990377426147s
t=14600, 9.780914545059204s
t=14800, 9.617256164550781s
t=15000, 9.553453922271729s
t=15200, 9.520542621612549s
t=15400, 9.520569324493408s
t=15600, 9.624237060546875s
t=15800, 9.660168886184692s
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |            -120 |
|          StdEpRet |            96.5 |
|          MaxEpRet |           -23.2 |
|          MinEpRet |            -216 |
|  AverageTestEpRet |            -234 |
|      StdTestEpRet |            97.1 |
|      MaxTestEpRet |           -12.4 |
|      MinTestEpRet |            -355 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |           -7.82 |
|         StdQ1Vals |               7 |
|         MaxQ1Vals |            30.8 |
|         MinQ1Vals |           -25.7 |
|     AverageQ2Vals |           -7.82

t=28000, 55.707035541534424s
t=28200, 9.824729442596436s
t=28400, 10.633593082427979s
t=28600, 11.227977275848389s


KeyboardInterrupt: 

In [618]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 10,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_Share_HistMemory10Len_HidStateLayer0_2L128_NoFreeze_MLPHist_RandomHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Share_HistMemory10Len_HidStateLayer0_2L128_NoFreeze_MLPHist_RandomHist\lstm_td3_HalfCheetah_Share_HistMemory10Len_HidStateLayer0_2L128_NoFreeze_MLPHist_RandomHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000019A9E820828>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory10Len_HidStateLayer0_2L128_NoFreeze_MLPHist_RandomHist",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000019AB5D78F48>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory10Len_HidStateLayer0_2L128_NoFreeze_MLPHist_RandomHist",
            "first_row":	true,
            "log_current_row

t=8000, 27.014147758483887s
t=8200, 6.216366291046143s
t=8400, 6.389825344085693s
t=8600, 6.510246276855469s
t=8800, 6.3370490074157715s
t=9000, 6.650116205215454s
t=9200, 6.808303117752075s
t=9400, 6.490243196487427s
t=9600, 6.237384080886841s
t=9800, 6.335054874420166s
---------------------------------------
|             Epoch |               5 |
|      AverageEpRet |           -61.4 |
|          StdEpRet |             393 |
|          MaxEpRet |             332 |
|          MinEpRet |            -455 |
|  AverageTestEpRet |             630 |
|      StdTestEpRet |             431 |
|      MaxTestEpRet |             989 |
|      MinTestEpRet |            -391 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+04 |
|     AverageQ1Vals |           -1.02 |
|         StdQ1Vals |            5.02 |
|         MaxQ1Vals |            21.7 |
|         MinQ1Vals |             -13 |
|     AverageQ2Vals |           -1.02 |
|       

t=22000, 24.81163263320923s
t=22200, 5.838380813598633s
t=22400, 6.332062721252441s
t=22600, 5.970030307769775s
t=22800, 5.780539035797119s
t=23000, 5.771560192108154s
t=23200, 5.708729028701782s
t=23400, 5.9540746212005615s
t=23600, 5.74762487411499s
t=23800, 5.462388038635254s
---------------------------------------
|             Epoch |              12 |
|      AverageEpRet |             -36 |
|          StdEpRet |             237 |
|          MaxEpRet |             201 |
|          MinEpRet |            -273 |
|  AverageTestEpRet |             655 |
|      StdTestEpRet |             871 |
|      MaxTestEpRet |        1.78e+03 |
|      MinTestEpRet |            -353 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         2.4e+04 |
|     AverageQ1Vals |            6.25 |
|         StdQ1Vals |            22.6 |
|         MaxQ1Vals |            56.1 |
|         MinQ1Vals |           -42.7 |
|     AverageQ2Vals |            6.25 |


t=36000, 25.145740509033203s
t=36200, 6.008923530578613s
t=36400, 5.386590480804443s
t=36600, 5.69277286529541s
t=36800, 5.608996391296387s
t=37000, 6.062855243682861s
t=37200, 5.903214931488037s
t=37400, 5.905557632446289s
t=37600, 6.102888345718384s
t=37800, 6.101679563522339s
---------------------------------------
|             Epoch |              19 |
|      AverageEpRet |             696 |
|          StdEpRet |        1.13e+03 |
|          MaxEpRet |        1.83e+03 |
|          MinEpRet |            -435 |
|  AverageTestEpRet |         1.5e+03 |
|      StdTestEpRet |             729 |
|      MaxTestEpRet |        1.95e+03 |
|      MinTestEpRet |            -394 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.8e+04 |
|     AverageQ1Vals |            24.1 |
|         StdQ1Vals |            37.4 |
|         MaxQ1Vals |            86.8 |
|         MinQ1Vals |           -59.5 |
|     AverageQ2Vals |            24.1 |


t=50000, 25.94525694847107s
t=50200, 6.159747362136841s
t=50400, 6.679891586303711s
t=50600, 7.190765619277954s
t=50800, 6.646222114562988s
t=51000, 6.372179269790649s
t=51200, 6.237315654754639s
t=51400, 5.606004238128662s
t=51600, 5.862322807312012s
t=51800, 5.721689939498901s
---------------------------------------
|             Epoch |              26 |
|      AverageEpRet |           2e+03 |
|          StdEpRet |            69.4 |
|          MaxEpRet |        2.06e+03 |
|          MinEpRet |        1.93e+03 |
|  AverageTestEpRet |        1.91e+03 |
|      StdTestEpRet |             429 |
|      MaxTestEpRet |        2.19e+03 |
|      MinTestEpRet |             654 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.2e+04 |
|     AverageQ1Vals |            41.6 |
|         StdQ1Vals |            49.2 |
|         MaxQ1Vals |             113 |
|         MinQ1Vals |           -76.2 |
|     AverageQ2Vals |            41.6 |


t=64000, 26.963692903518677s
t=64200, 6.050525903701782s
t=64400, 5.8569416999816895s
t=64600, 6.681128740310669s
t=64800, 5.91717267036438s
t=65000, 5.8353893756866455s
t=65200, 5.996959447860718s
t=65400, 5.809460163116455s
t=65600, 5.573091983795166s
t=65800, 5.869300127029419s
---------------------------------------
|             Epoch |              33 |
|      AverageEpRet |        1.98e+03 |
|          StdEpRet |            22.4 |
|          MaxEpRet |           2e+03 |
|          MinEpRet |        1.96e+03 |
|  AverageTestEpRet |        2.26e+03 |
|      StdTestEpRet |            78.8 |
|      MaxTestEpRet |        2.37e+03 |
|      MinTestEpRet |        2.07e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.6e+04 |
|     AverageQ1Vals |            59.7 |
|         StdQ1Vals |            53.1 |
|         MaxQ1Vals |             127 |
|         MinQ1Vals |           -85.8 |
|     AverageQ2Vals |            59.7 

t=78000, 30.582638025283813s
t=78200, 6.5322585105896s
t=78400, 6.9836180210113525s
t=78600, 7.87890625s
t=78800, 6.6881103515625s
t=79000, 7.080795049667358s
t=79200, 6.144564867019653s
t=79400, 5.997447490692139s
t=79600, 6.287182331085205s
t=79800, 6.556461811065674s
---------------------------------------
|             Epoch |              40 |
|      AverageEpRet |         2.2e+03 |
|          StdEpRet |             140 |
|          MaxEpRet |        2.34e+03 |
|          MinEpRet |        2.06e+03 |
|  AverageTestEpRet |        2.27e+03 |
|      StdTestEpRet |             222 |
|      MaxTestEpRet |        2.47e+03 |
|      MinTestEpRet |        1.65e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           8e+04 |
|     AverageQ1Vals |            72.5 |
|         StdQ1Vals |            57.5 |
|         MaxQ1Vals |             140 |
|         MinQ1Vals |           -94.9 |
|     AverageQ2Vals |            72.5 |
|        

t=92000, 27.13242483139038s
t=92200, 5.93512225151062s
t=92400, 6.092702388763428s
t=92600, 6.2612526416778564s
t=92800, 6.50659441947937s
t=93000, 7.321416139602661s
t=93200, 7.67047643661499s
t=93400, 6.668164253234863s
t=93600, 7.25459361076355s
t=93800, 6.524547100067139s
---------------------------------------
|             Epoch |              47 |
|      AverageEpRet |        2.39e+03 |
|          StdEpRet |            7.83 |
|          MaxEpRet |         2.4e+03 |
|          MinEpRet |        2.38e+03 |
|  AverageTestEpRet |         2.2e+03 |
|      StdTestEpRet |             640 |
|      MaxTestEpRet |        2.74e+03 |
|      MinTestEpRet |        1.06e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         9.4e+04 |
|     AverageQ1Vals |            85.3 |
|         StdQ1Vals |            61.2 |
|         MaxQ1Vals |             157 |
|         MinQ1Vals |            -104 |
|     AverageQ2Vals |            85.3 |
|  

In [9]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_Share_HistMemory5Len_HidStateLayer0_2L128_NoFreeze_MLPHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Share_HistMemory5Len_HidStateLayer0_2L128_NoFreeze_MLPHist\lstm_td3_HalfCheetah_Share_HistMemory5Len_HidStateLayer0_2L128_NoFreeze_MLPHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x000002783EA0CC18>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory5Len_HidStateLayer0_2L128_NoFreeze_MLPHist",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000002783BA520C8>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory5Len_HidStateLayer0_2L128_NoFreeze_MLPHist",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
          

t=8000, 65.81658816337585s
t=8200, 12.820717334747314s
t=8400, 12.90947961807251s
t=8600, 13.33135437965393s
t=8800, 13.306416273117065s
t=9000, 13.03315019607544s
t=9200, 12.851634502410889s
t=9400, 12.912471055984497s
t=9600, 13.121912240982056s
t=9800, 12.884547472000122s
---------------------------------------
|             Epoch |               5 |
|      AverageEpRet |             621 |
|          StdEpRet |            48.6 |
|          MaxEpRet |             670 |
|          MinEpRet |             573 |
|  AverageTestEpRet |             444 |
|      StdTestEpRet |             392 |
|      MaxTestEpRet |             769 |
|      MinTestEpRet |            -514 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+04 |
|     AverageQ1Vals |            1.28 |
|         StdQ1Vals |            5.87 |
|         MaxQ1Vals |            18.3 |
|         MinQ1Vals |           -28.1 |
|     AverageQ2Vals |            1.28 |
|   

t=22000, 61.39406871795654s
t=22200, 11.990908861160278s
t=22400, 11.847321271896362s
t=22600, 11.875244617462158s
t=22800, 12.893754005432129s
t=23000, 11.99967646598816s
t=23200, 12.309169292449951s
t=23400, 13.042644500732422s
t=23600, 13.279489994049072s
t=23800, 13.497906684875488s
---------------------------------------
|             Epoch |              12 |
|      AverageEpRet |             365 |
|          StdEpRet |             308 |
|          MaxEpRet |             673 |
|          MinEpRet |            57.1 |
|  AverageTestEpRet |             732 |
|      StdTestEpRet |             122 |
|      MaxTestEpRet |             931 |
|      MinTestEpRet |             495 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         2.4e+04 |
|     AverageQ1Vals |            16.2 |
|         StdQ1Vals |            11.8 |
|         MaxQ1Vals |              36 |
|         MinQ1Vals |           -47.1 |
|     AverageQ2Vals |           

t=36000, 59.16484189033508s
t=36200, 12.098599433898926s
t=36400, 11.933091640472412s
t=36600, 12.071720838546753s
t=36800, 11.91613483428955s
t=37000, 11.779501676559448s
t=37200, 11.97597622871399s
t=37400, 11.704701662063599s
t=37600, 12.323047637939453s
t=37800, 12.593326330184937s
---------------------------------------
|             Epoch |              19 |
|      AverageEpRet |        1.18e+03 |
|          StdEpRet |             331 |
|          MaxEpRet |        1.51e+03 |
|          MinEpRet |             849 |
|  AverageTestEpRet |             978 |
|      StdTestEpRet |             227 |
|      MaxTestEpRet |        1.46e+03 |
|      MinTestEpRet |             702 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.8e+04 |
|     AverageQ1Vals |            22.4 |
|         StdQ1Vals |            17.7 |
|         MaxQ1Vals |            46.3 |
|         MinQ1Vals |           -71.5 |
|     AverageQ2Vals |            

t=50000, 59.55577540397644s
t=50200, 12.533460140228271s
t=50400, 12.608283042907715s
t=50600, 12.623245239257812s
t=50800, 12.474643230438232s
t=51000, 12.520519971847534s
t=51200, 12.281158685684204s
t=51400, 12.582356214523315s
t=51600, 12.128567934036255s
t=51800, 12.348978042602539s
---------------------------------------
|             Epoch |              26 |
|      AverageEpRet |         1.8e+03 |
|          StdEpRet |            40.3 |
|          MaxEpRet |        1.84e+03 |
|          MinEpRet |        1.76e+03 |
|  AverageTestEpRet |        1.88e+03 |
|      StdTestEpRet |             484 |
|      MaxTestEpRet |        2.67e+03 |
|      MinTestEpRet |        1.12e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.2e+04 |
|     AverageQ1Vals |            38.3 |
|         StdQ1Vals |            25.9 |
|         MaxQ1Vals |             116 |
|         MinQ1Vals |           -67.8 |
|     AverageQ2Vals |          

t=64000, 62.551748752593994s
t=64200, 12.154497385025024s
t=64400, 12.127571105957031s
t=64600, 12.880557537078857s
t=64800, 12.407826662063599s
t=65000, 12.456686973571777s
t=65200, 12.530492544174194s
t=65400, 13.615591049194336s
t=65600, 13.24458646774292s
t=65800, 12.916459560394287s
---------------------------------------
|             Epoch |              33 |
|      AverageEpRet |        2.57e+03 |
|          StdEpRet |             543 |
|          MaxEpRet |        3.11e+03 |
|          MinEpRet |        2.02e+03 |
|  AverageTestEpRet |         3.1e+03 |
|      StdTestEpRet |             103 |
|      MaxTestEpRet |         3.3e+03 |
|      MinTestEpRet |        2.88e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.6e+04 |
|     AverageQ1Vals |            55.5 |
|         StdQ1Vals |            36.7 |
|         MaxQ1Vals |             148 |
|         MinQ1Vals |           -71.2 |
|     AverageQ2Vals |          

KeyboardInterrupt: 

In [599]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 0,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_Share_HistMemory0Len_HidStateLayer0_2L128_NoFreeze_MLPHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Share_HistMemory0Len_HidStateLayer0_2L128_NoFreeze_MLPHist\lstm_td3_HalfCheetah_Share_HistMemory0Len_HidStateLayer0_2L128_NoFreeze_MLPHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000019A9ED3F948>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory0Len_HidStateLayer0_2L128_NoFreeze_MLPHist",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000019A9524CA88>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory0Len_HidStateLayer0_2L128_NoFreeze_MLPHist",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
          

t=8000, 65.4758095741272s
t=8200, 12.275164365768433s
t=8400, 12.189394235610962s
t=8600, 12.004888534545898s
t=8800, 12.724963426589966s
t=9000, 12.470639944076538s
t=9200, 12.096642255783081s
t=9400, 13.187723398208618s
t=9600, 11.552099466323853s
t=9800, 11.868253469467163s
---------------------------------------
|             Epoch |               5 |
|      AverageEpRet |           -1.41 |
|          StdEpRet |           0.189 |
|          MaxEpRet |           -1.22 |
|          MinEpRet |            -1.6 |
|  AverageTestEpRet |             -36 |
|      StdTestEpRet |            53.7 |
|      MaxTestEpRet |              56 |
|      MinTestEpRet |            -155 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+04 |
|     AverageQ1Vals |           -1.36 |
|         StdQ1Vals |            3.83 |
|         MaxQ1Vals |            22.3 |
|         MinQ1Vals |           -12.5 |
|     AverageQ2Vals |           -1.36 |
| 

t=22000, 62.434993505477905s
t=22200, 27134.287494659424s
t=22400, 13.545396327972412s
t=22600, 10.3722505569458s
t=22800, 10.074057817459106s
t=23000, 9.752906560897827s
t=23200, 9.88056993484497s
t=23400, 10.795138835906982s
t=23600, 10.99861454963684s
t=23800, 9.884559392929077s
---------------------------------------
|             Epoch |              12 |
|      AverageEpRet |        2.02e+03 |
|          StdEpRet |             131 |
|          MaxEpRet |        2.15e+03 |
|          MinEpRet |        1.89e+03 |
|  AverageTestEpRet |        1.85e+03 |
|      StdTestEpRet |             533 |
|      MaxTestEpRet |        2.42e+03 |
|      MinTestEpRet |             650 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         2.4e+04 |
|     AverageQ1Vals |            15.9 |
|         StdQ1Vals |            14.3 |
|         MaxQ1Vals |            65.5 |
|         MinQ1Vals |           -8.85 |
|     AverageQ2Vals |            15.9

t=36000, 78.24266481399536s
t=36200, 13.606411218643188s
t=36400, 13.49739956855774s
t=36600, 13.397964477539062s
t=36800, 13.296416759490967s
t=37000, 14.436954975128174s
t=37200, 13.850140810012817s
t=37400, 14.781585931777954s
t=37600, 14.10844612121582s
t=37800, 12.60640287399292s
---------------------------------------
|             Epoch |              19 |
|      AverageEpRet |        2.84e+03 |
|          StdEpRet |             124 |
|          MaxEpRet |        2.96e+03 |
|          MinEpRet |        2.71e+03 |
|  AverageTestEpRet |        2.55e+03 |
|      StdTestEpRet |             719 |
|      MaxTestEpRet |        3.33e+03 |
|      MinTestEpRet |        1.14e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.8e+04 |
|     AverageQ1Vals |            54.1 |
|         StdQ1Vals |            34.9 |
|         MaxQ1Vals |             130 |
|         MinQ1Vals |            2.09 |
|     AverageQ2Vals |            5

t=50000, 73.7058846950531s
t=50200, 13.699355363845825s
t=50400, 13.90879487991333s
t=50600, 13.623558044433594s
t=50800, 15.235785245895386s
t=51000, 14.470292568206787s
t=51200, 16.143816709518433s
t=51400, 14.112250804901123s
t=51600, 13.254545450210571s
t=51800, 13.528810977935791s
---------------------------------------
|             Epoch |              26 |
|      AverageEpRet |        3.41e+03 |
|          StdEpRet |            7.32 |
|          MaxEpRet |        3.41e+03 |
|          MinEpRet |         3.4e+03 |
|  AverageTestEpRet |        3.76e+03 |
|      StdTestEpRet |             154 |
|      MaxTestEpRet |        3.95e+03 |
|      MinTestEpRet |        3.45e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.2e+04 |
|     AverageQ1Vals |            95.1 |
|         StdQ1Vals |            52.8 |
|         MaxQ1Vals |             190 |
|         MinQ1Vals |            9.24 |
|     AverageQ2Vals |            

KeyboardInterrupt: 

In [594]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 10,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_Share_HistMemory10Len_HidStateLayer0_2L128_NoFreeze_MLPHist'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Share_HistMemory10Len_HidStateLayer0_2L128_NoFreeze_MLPHist\lstm_td3_HalfCheetah_Share_HistMemory10Len_HidStateLayer0_2L128_NoFreeze_MLPHist_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000019AA08A4F78>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory10Len_HidStateLayer0_2L128_NoFreeze_MLPHist",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000019A9E877608>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory10Len_HidStateLayer0_2L128_NoFreeze_MLPHist",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
      

  result = entry_point.load(False)


t=600, 0.05485367774963379s
t=800, 0.04388284683227539s
t=1000, 0.028923511505126953s
t=1200, 14.575966358184814s
t=1400, 12.216326475143433s
t=1600, 11.923075199127197s
t=1800, 13.022460699081421s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -389 |
|          StdEpRet |             163 |
|          MaxEpRet |            -226 |
|          MinEpRet |            -552 |
|  AverageTestEpRet |            -594 |
|      StdTestEpRet |           0.556 |
|      MaxTestEpRet |            -593 |
|      MinTestEpRet |            -594 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.378 |
|         StdQ1Vals |           0.817 |
|         MaxQ1Vals |             4.5 |
|         MinQ1Vals |           -4.29 |
|     AverageQ2Vals |          -0.378 |
|         StdQ2Vals |           0.816 |
|         MaxQ2Vals |            4.58 |
| 

t=14000, 67.18129801750183s
t=14200, 13.624557733535767s
t=14400, 12.431769371032715s
t=14600, 12.610242128372192s
t=14800, 13.555745124816895s
t=15000, 15.88293981552124s
t=15200, 12.765456199645996s
t=15400, 12.72795033454895s
t=15600, 12.293089389801025s
t=15800, 13.469967603683472s
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |            -550 |
|          StdEpRet |        0.000947 |
|          MaxEpRet |            -550 |
|          MinEpRet |            -550 |
|  AverageTestEpRet |            -594 |
|      StdTestEpRet |            1.19 |
|      MaxTestEpRet |            -592 |
|      MinTestEpRet |            -596 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |           -14.8 |
|         StdQ1Vals |             2.1 |
|         MaxQ1Vals |            13.6 |
|         MinQ1Vals |           -26.2 |
|     AverageQ2Vals |           -

t=28000, 68.21922159194946s
t=28200, 12.052451372146606s
t=28400, 12.49778437614441s
t=28600, 12.462159872055054s
t=28800, 12.675599813461304s
t=29000, 12.650688648223877s
t=29200, 12.447617292404175s
t=29400, 12.52827262878418s
t=29600, 12.831859827041626s
t=29800, 11.854166507720947s
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |            -550 |
|          StdEpRet |            1.12 |
|          MaxEpRet |            -549 |
|          MinEpRet |            -551 |
|  AverageTestEpRet |            -593 |
|      StdTestEpRet |           0.903 |
|      MaxTestEpRet |            -592 |
|      MinTestEpRet |            -594 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |           -25.7 |
|         StdQ1Vals |            1.79 |
|         MaxQ1Vals |            8.11 |
|         MinQ1Vals |             -39 |
|     AverageQ2Vals |           -

t=42000, 83.45003890991211s
t=42200, 14.08431625366211s
t=42400, 14.579598188400269s
t=42600, 15.207722425460815s
t=42800, 16.32173728942871s
t=43000, 21.035690784454346s
t=43200, 17.730626821517944s
t=43400, 16.502655744552612s
t=43600, 14.451582431793213s
t=43800, 13.923625469207764s
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |            -551 |
|          StdEpRet |           0.164 |
|          MaxEpRet |            -551 |
|          MinEpRet |            -551 |
|  AverageTestEpRet |            -594 |
|      StdTestEpRet |            1.01 |
|      MaxTestEpRet |            -592 |
|      MinTestEpRet |            -595 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.4e+04 |
|     AverageQ1Vals |           -33.3 |
|         StdQ1Vals |            1.58 |
|         MaxQ1Vals |          -0.105 |
|         MinQ1Vals |           -49.5 |
|     AverageQ2Vals |           -

t=56000, 68.21736598014832s
t=56200, 14.24486780166626s
t=56400, 12.637224674224854s
t=56600, 14.311717987060547s
t=56800, 13.309457778930664s
t=57000, 13.265515804290771s
t=57200, 12.193417072296143s
t=57400, 12.322036027908325s
t=57600, 12.472634553909302s
t=57800, 13.464725971221924s
---------------------------------------
|             Epoch |              29 |
|      AverageEpRet |            -550 |
|          StdEpRet |           0.504 |
|          MaxEpRet |            -549 |
|          MinEpRet |            -550 |
|  AverageTestEpRet |            -594 |
|      StdTestEpRet |            1.11 |
|      MaxTestEpRet |            -593 |
|      MinTestEpRet |            -596 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.8e+04 |
|     AverageQ1Vals |           -38.7 |
|         StdQ1Vals |            1.41 |
|         MaxQ1Vals |           -7.01 |
|         MinQ1Vals |           -60.7 |
|     AverageQ2Vals |           

t=70000, 73.85998892784119s
t=70200, 13.843003034591675s
t=70400, 13.473756551742554s
t=70600, 13.37320852279663s
t=70800, 13.932752847671509s
t=71000, 14.25229787826538s
t=71200, 12.741950750350952s
t=71400, 11.945033550262451s
t=71600, 12.23328948020935s
t=71800, 12.577325105667114s
---------------------------------------
|             Epoch |              36 |
|      AverageEpRet |            -549 |
|          StdEpRet |            1.19 |
|          MaxEpRet |            -547 |
|          MinEpRet |            -550 |
|  AverageTestEpRet |            -594 |
|      StdTestEpRet |            1.11 |
|      MaxTestEpRet |            -592 |
|      MinTestEpRet |            -597 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         7.2e+04 |
|     AverageQ1Vals |           -42.4 |
|         StdQ1Vals |            1.45 |
|         MaxQ1Vals |           -10.7 |
|         MinQ1Vals |           -66.8 |
|     AverageQ2Vals |           -4

t=84000, 68.31161069869995s
t=84200, 13.152786254882812s
t=84400, 12.296133518218994s
t=84600, 12.970460176467896s
t=84800, 12.25025463104248s
t=85000, 12.330993890762329s
t=85200, 12.482075452804565s
t=85400, 12.449660301208496s
t=85600, 12.271149635314941s
t=85800, 12.688057899475098s
---------------------------------------
|             Epoch |              43 |
|      AverageEpRet |            -551 |
|          StdEpRet |           0.403 |
|          MaxEpRet |            -550 |
|          MinEpRet |            -551 |
|  AverageTestEpRet |            -595 |
|      StdTestEpRet |             1.2 |
|      MaxTestEpRet |            -592 |
|      MinTestEpRet |            -596 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         8.6e+04 |
|     AverageQ1Vals |           -45.1 |
|         StdQ1Vals |            1.53 |
|         MaxQ1Vals |           -13.2 |
|         MinQ1Vals |           -71.6 |
|     AverageQ2Vals |           

t=98000, 67.10530805587769s
t=98200, 11.932082891464233s
t=98400, 12.023842096328735s
t=98600, 11.964961290359497s
t=98800, 12.85464859008789s
t=99000, 12.261265516281128s
t=99200, 12.599265336990356s
t=99400, 12.052761793136597s
t=99600, 12.440743923187256s
t=99800, 14.158103942871094s
---------------------------------------
|             Epoch |              50 |
|      AverageEpRet |            -549 |
|          StdEpRet |           0.971 |
|          MaxEpRet |            -549 |
|          MinEpRet |            -550 |
|  AverageTestEpRet |            -594 |
|      StdTestEpRet |           0.913 |
|      MaxTestEpRet |            -592 |
|      MinTestEpRet |            -595 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+05 |
|     AverageQ1Vals |             -47 |
|         StdQ1Vals |            1.52 |
|         MaxQ1Vals |             -12 |
|         MinQ1Vals |           -75.4 |
|     AverageQ2Vals |           

In [586]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 10,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_Share_HistMemory10Len_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Share_HistMemory10Len_HidStateLayer0_2L128_NoFreeze\lstm_td3_HalfCheetah_Share_HistMemory10Len_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000019AB9E251F8>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory10Len_HidStateLayer0_2L128_NoFreeze",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000019AD2010508>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Share_HistMemory10Len_HidStateLayer0_2L128_NoFreeze",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\

  result = entry_point.load(False)


t=1000, 0.030948877334594727s
t=1200, 13.396913528442383s
t=1400, 12.99985408782959s
t=1600, 11.602367162704468s
t=1800, 12.701417446136475s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -394 |
|          StdEpRet |            82.2 |
|          MaxEpRet |            -312 |
|          MinEpRet |            -476 |
|  AverageTestEpRet |            -478 |
|      StdTestEpRet |            2.73 |
|      MaxTestEpRet |            -471 |
|      MinTestEpRet |            -481 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.668 |
|         StdQ1Vals |           0.813 |
|         MaxQ1Vals |            3.05 |
|         MinQ1Vals |           -4.38 |
|     AverageQ2Vals |          -0.671 |
|         StdQ2Vals |           0.815 |
|         MaxQ2Vals |            2.93 |
|         MinQ2Vals |           -4.42 |
|            LossPi

t=14000, 75.04860091209412s
t=14200, 12.916218280792236s
t=14400, 13.448941469192505s
t=14600, 13.29253888130188s
t=14800, 13.269338130950928s
t=15000, 13.321585655212402s
t=15200, 13.337301969528198s
t=15400, 14.660050392150879s
t=15600, 13.203516244888306s
t=15800, 13.518590688705444s
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |             727 |
|          StdEpRet |             164 |
|          MaxEpRet |             891 |
|          MinEpRet |             564 |
|  AverageTestEpRet |             897 |
|      StdTestEpRet |            85.7 |
|      MaxTestEpRet |             987 |
|      MinTestEpRet |             713 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |            9.24 |
|         StdQ1Vals |            3.92 |
|         MaxQ1Vals |            24.3 |
|         MinQ1Vals |           -7.03 |
|     AverageQ2Vals |           

t=28000, 69.7678816318512s
t=28200, 12.93237829208374s
t=28400, 13.36029577255249s
t=28600, 12.634171485900879s
t=28800, 13.247592449188232s
t=29000, 12.677062034606934s
t=29200, 13.659461736679077s
t=29400, 13.64855146408081s
t=29600, 12.21829104423523s
t=29800, 12.745931386947632s
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |        1.09e+03 |
|          StdEpRet |            44.5 |
|          MaxEpRet |        1.13e+03 |
|          MinEpRet |        1.04e+03 |
|  AverageTestEpRet |        1.12e+03 |
|      StdTestEpRet |            40.2 |
|      MaxTestEpRet |        1.19e+03 |
|      MinTestEpRet |        1.08e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |            31.5 |
|         StdQ1Vals |            5.65 |
|         MaxQ1Vals |            55.1 |
|         MinQ1Vals |            5.49 |
|     AverageQ2Vals |            31.

t=42000, 70.21720051765442s
t=42200, 12.724964618682861s
t=42400, 12.707050561904907s
t=42600, 13.40314793586731s
t=42800, 12.67509388923645s
t=43000, 12.419806241989136s
t=43200, 12.365922212600708s
t=43400, 12.577359676361084s
t=43600, 12.3319833278656s
t=43800, 12.595307111740112s
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |        1.15e+03 |
|          StdEpRet |              83 |
|          MaxEpRet |        1.24e+03 |
|          MinEpRet |        1.07e+03 |
|  AverageTestEpRet |        1.25e+03 |
|      StdTestEpRet |            66.3 |
|      MaxTestEpRet |        1.35e+03 |
|      MinTestEpRet |        1.17e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.4e+04 |
|     AverageQ1Vals |            50.4 |
|         StdQ1Vals |            5.88 |
|         MaxQ1Vals |            74.6 |
|         MinQ1Vals |            21.7 |
|     AverageQ2Vals |            50

t=56000, 65.12479519844055s
t=56200, 14.05540156364441s
t=56400, 13.066084623336792s
t=56600, 12.26219391822815s
t=56800, 12.317026138305664s
t=57000, 13.170769214630127s
t=57200, 13.808096408843994s
t=57400, 12.438692569732666s
t=57600, 12.242282629013062s
t=57800, 12.712964296340942s
---------------------------------------
|             Epoch |              29 |
|      AverageEpRet |         1.2e+03 |
|          StdEpRet |            16.1 |
|          MaxEpRet |        1.22e+03 |
|          MinEpRet |        1.19e+03 |
|  AverageTestEpRet |        1.25e+03 |
|      StdTestEpRet |            54.9 |
|      MaxTestEpRet |        1.35e+03 |
|      MinTestEpRet |        1.18e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.8e+04 |
|     AverageQ1Vals |            64.1 |
|         StdQ1Vals |            5.81 |
|         MaxQ1Vals |            83.1 |
|         MinQ1Vals |            30.8 |
|     AverageQ2Vals |            

t=70000, 63.882086992263794s
t=70200, 12.682111740112305s
t=70400, 13.520797729492188s
t=70600, 13.011196851730347s
t=70800, 12.217353582382202s
t=71000, 12.290088415145874s
t=71200, 12.67611289024353s
t=71400, 12.021825790405273s
t=71600, 12.078687906265259s
t=71800, 12.332014799118042s
---------------------------------------
|             Epoch |              36 |
|      AverageEpRet |        1.27e+03 |
|          StdEpRet |            31.3 |
|          MaxEpRet |         1.3e+03 |
|          MinEpRet |        1.23e+03 |
|  AverageTestEpRet |         1.3e+03 |
|      StdTestEpRet |            38.5 |
|      MaxTestEpRet |        1.34e+03 |
|      MinTestEpRet |        1.23e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         7.2e+04 |
|     AverageQ1Vals |            73.8 |
|         StdQ1Vals |             6.8 |
|         MaxQ1Vals |            91.9 |
|         MinQ1Vals |            28.9 |
|     AverageQ2Vals |          

t=84000, 66.4801676273346s
t=84200, 12.250223875045776s
t=84400, 12.839632987976074s
t=84600, 12.474663496017456s
t=84800, 14.348587274551392s
t=85000, 14.60923171043396s
t=85200, 13.14683747291565s
t=85400, 18.8659188747406s
t=85600, 14.372296571731567s
t=85800, 13.829496145248413s
---------------------------------------
|             Epoch |              43 |
|      AverageEpRet |        1.34e+03 |
|          StdEpRet |            3.21 |
|          MaxEpRet |        1.34e+03 |
|          MinEpRet |        1.34e+03 |
|  AverageTestEpRet |        1.35e+03 |
|      StdTestEpRet |              58 |
|      MaxTestEpRet |        1.43e+03 |
|      MinTestEpRet |        1.21e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         8.6e+04 |
|     AverageQ1Vals |            81.9 |
|         StdQ1Vals |            7.53 |
|         MaxQ1Vals |            99.4 |
|         MinQ1Vals |            15.8 |
|     AverageQ2Vals |            81.

t=98000, 66.03817415237427s
t=98200, 13.76916790008545s
t=98400, 13.47498607635498s
t=98600, 12.452691793441772s
t=98800, 12.055747747421265s
t=99000, 12.491560697555542s
t=99200, 11.936103582382202s
t=99400, 12.086667776107788s
t=99600, 12.501561164855957s
t=99800, 12.442710399627686s
---------------------------------------
|             Epoch |              50 |
|      AverageEpRet |        1.37e+03 |
|          StdEpRet |            5.57 |
|          MaxEpRet |        1.38e+03 |
|          MinEpRet |        1.37e+03 |
|  AverageTestEpRet |        1.39e+03 |
|      StdTestEpRet |            49.6 |
|      MaxTestEpRet |        1.47e+03 |
|      MinTestEpRet |        1.34e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+05 |
|     AverageQ1Vals |            88.8 |
|         StdQ1Vals |            7.71 |
|         MaxQ1Vals |             105 |
|         MinQ1Vals |            12.7 |
|     AverageQ2Vals |            

In [584]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 10,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_NoShare_HistMemory10Len_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_NoShare_HistMemory10Len_HidStateLayer0_2L128_NoFreeze\lstm_td3_HalfCheetah_NoShare_HistMemory10Len_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000019A952BB438>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_NoShare_HistMemory10Len_HidStateLayer0_2L128_NoFreeze",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000019AA0D2C088>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_NoShare_HistMemory10Len_HidStateLayer0_2L128_NoFreeze",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\

  result = entry_point.load(False)


t=1200, 14.852272510528564s
t=1400, 13.489915132522583s
t=1600, 12.20737099647522s
t=1800, 12.542704820632935s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -409 |
|          StdEpRet |             126 |
|          MaxEpRet |            -283 |
|          MinEpRet |            -536 |
|  AverageTestEpRet |            -589 |
|      StdTestEpRet |           0.856 |
|      MaxTestEpRet |            -589 |
|      MinTestEpRet |            -591 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.311 |
|         StdQ1Vals |           0.819 |
|         MaxQ1Vals |            3.61 |
|         MinQ1Vals |           -2.59 |
|     AverageQ2Vals |          -0.312 |
|         StdQ2Vals |            0.82 |
|         MaxQ2Vals |            3.62 |
|         MinQ2Vals |           -2.55 |
|            LossPi |          -0.262 |
|        

t=14000, 68.45121693611145s
t=14200, 12.734263896942139s
t=14400, 12.550921201705933s
t=14600, 14.251153469085693s
t=14800, 13.591660261154175s
t=15000, 13.231027603149414s
t=15200, 13.051060199737549s
t=15400, 14.395217657089233s
t=15600, 13.548591375350952s
t=15800, 12.60037112236023s
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |            -158 |
|          StdEpRet |            43.2 |
|          MaxEpRet |            -115 |
|          MinEpRet |            -201 |
|  AverageTestEpRet |             202 |
|      StdTestEpRet |             537 |
|      MaxTestEpRet |             896 |
|      MinTestEpRet |            -467 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |           -2.46 |
|         StdQ1Vals |            10.1 |
|         MaxQ1Vals |            30.7 |
|         MinQ1Vals |             -28 |
|     AverageQ2Vals |           

t=28000, 74.39319157600403s
t=28200, 14.356443643569946s
t=28400, 13.094177007675171s
t=28600, 14.225358963012695s
t=28800, 13.835674047470093s
t=29000, 12.442780017852783s
t=29200, 13.337114572525024s
t=29400, 13.788897037506104s
t=29600, 12.4425528049469s
t=29800, 12.527473211288452s
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |             964 |
|          StdEpRet |            86.6 |
|          MaxEpRet |        1.05e+03 |
|          MinEpRet |             878 |
|  AverageTestEpRet |        1.04e+03 |
|      StdTestEpRet |             122 |
|      MaxTestEpRet |        1.25e+03 |
|      MinTestEpRet |             870 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |            6.75 |
|         StdQ1Vals |            20.1 |
|         MaxQ1Vals |            55.8 |
|         MinQ1Vals |           -34.4 |
|     AverageQ2Vals |            

t=42000, 67.39270901679993s
t=42200, 13.252628564834595s
t=42400, 12.381061792373657s
t=42600, 12.581313133239746s
t=42800, 12.153529644012451s
t=43000, 14.000512599945068s
t=43200, 13.614780902862549s
t=43400, 12.622209787368774s
t=43600, 12.965803384780884s
t=43800, 12.865015983581543s
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |        1.05e+03 |
|          StdEpRet |              24 |
|          MaxEpRet |        1.07e+03 |
|          MinEpRet |        1.03e+03 |
|  AverageTestEpRet |        1.16e+03 |
|      StdTestEpRet |            50.7 |
|      MaxTestEpRet |        1.24e+03 |
|      MinTestEpRet |        1.08e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.4e+04 |
|     AverageQ1Vals |            22.3 |
|         StdQ1Vals |            28.3 |
|         MaxQ1Vals |            70.6 |
|         MinQ1Vals |           -43.4 |
|     AverageQ2Vals |          

t=56000, 69.7299222946167s
t=56200, 13.136749505996704s
t=56400, 12.864033699035645s
t=56600, 12.595314502716064s
t=56800, 12.6361985206604s
t=57000, 12.596324682235718s
t=57200, 14.645804643630981s
t=57400, 13.647494554519653s
t=57600, 12.964625597000122s
t=57800, 12.658113956451416s
---------------------------------------
|             Epoch |              29 |
|      AverageEpRet |        1.12e+03 |
|          StdEpRet |            85.9 |
|          MaxEpRet |         1.2e+03 |
|          MinEpRet |        1.03e+03 |
|  AverageTestEpRet |        1.14e+03 |
|      StdTestEpRet |            74.5 |
|      MaxTestEpRet |        1.31e+03 |
|      MinTestEpRet |        1.02e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         5.8e+04 |
|     AverageQ1Vals |            35.2 |
|         StdQ1Vals |            31.2 |
|         MaxQ1Vals |            83.2 |
|         MinQ1Vals |           -47.3 |
|     AverageQ2Vals |            3

t=70000, 65.2222912311554s
t=70200, 12.780784368515015s
t=70400, 12.289129257202148s
t=70600, 12.364009380340576s
t=70800, 12.240289211273193s
t=71000, 13.832994937896729s
t=71200, 13.633514165878296s
t=71400, 11.87825345993042s
t=71600, 12.650164127349854s
t=71800, 12.475638151168823s
---------------------------------------
|             Epoch |              36 |
|      AverageEpRet |        1.74e+03 |
|          StdEpRet |            8.23 |
|          MaxEpRet |        1.75e+03 |
|          MinEpRet |        1.73e+03 |
|  AverageTestEpRet |        1.63e+03 |
|      StdTestEpRet |             123 |
|      MaxTestEpRet |        1.82e+03 |
|      MinTestEpRet |         1.4e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         7.2e+04 |
|     AverageQ1Vals |            43.9 |
|         StdQ1Vals |            31.3 |
|         MaxQ1Vals |            95.5 |
|         MinQ1Vals |           -50.2 |
|     AverageQ2Vals |            

t=84000, 65.18293762207031s
t=84200, 12.512947082519531s
t=84400, 13.262117147445679s
t=84600, 13.552468538284302s
t=84800, 15.250809907913208s
t=85000, 12.603317260742188s
t=85200, 12.030786991119385s
t=85400, 13.971612930297852s
t=85600, 13.352229118347168s
t=85800, 12.476593732833862s
---------------------------------------
|             Epoch |              43 |
|      AverageEpRet |        1.71e+03 |
|          StdEpRet |            83.5 |
|          MaxEpRet |        1.79e+03 |
|          MinEpRet |        1.62e+03 |
|  AverageTestEpRet |        1.95e+03 |
|      StdTestEpRet |             143 |
|      MaxTestEpRet |         2.2e+03 |
|      MinTestEpRet |        1.77e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         8.6e+04 |
|     AverageQ1Vals |              53 |
|         StdQ1Vals |            34.2 |
|         MaxQ1Vals |             118 |
|         MinQ1Vals |           -54.5 |
|     AverageQ2Vals |          

KeyboardInterrupt: 

In [583]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 10,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_NoShare_HistMemory10Len_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_NoShare_HistMemory10Len_HidStateLayer0_2L128_NoFreeze\lstm_td3_HalfCheetah_NoShare_HistMemory10Len_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000019A957454C8>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_NoShare_HistMemory10Len_HidStateLayer0_2L128_NoFreeze",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000019A95287E48>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_NoShare_HistMemory10Len_HidStateLayer0_2L128_NoFreeze",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\

t=8000, 74.81982374191284s
t=8200, 13.47153615951538s
t=8400, 13.631694555282593s
t=8600, 14.007148027420044s
t=8800, 12.388357639312744s
t=9000, 12.251749753952026s
t=9200, 13.414091348648071s
t=9400, 13.103015661239624s
t=9600, 14.376513004302979s
t=9800, 12.7927827835083s
---------------------------------------
|             Epoch |               5 |
|      AverageEpRet |            -302 |
|          StdEpRet |            1.37 |
|          MaxEpRet |            -301 |
|          MinEpRet |            -304 |
|  AverageTestEpRet |            -355 |
|      StdTestEpRet |            41.2 |
|      MaxTestEpRet |            -307 |
|      MinTestEpRet |            -430 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           1e+04 |
|     AverageQ1Vals |           -1.41 |
|         StdQ1Vals |             3.8 |
|         MaxQ1Vals |            15.3 |
|         MinQ1Vals |           -17.2 |
|     AverageQ2Vals |           -1.41 |
|   

KeyboardInterrupt: 

In [569]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 10,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_NoShare_HistMemory0Len_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_NoShare_HistMemory0Len_HidStateLayer0_2L128_NoFreeze\lstm_td3_HalfCheetah_NoShare_HistMemory0Len_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000019AA3AAC1F8>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_NoShare_HistMemory0Len_HidStateLayer0_2L128_NoFreeze",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000019A95270108>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_NoShare_HistMemory0Len_HidStateLayer0_2L128_NoFreeze",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\use

  result = entry_point.load(False)


t=1200, 11.947043418884277s
t=1400, 12.029356956481934s
t=1600, 12.564987182617188s
t=1800, 13.224623918533325s
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -377 |
|          StdEpRet |              78 |
|          MaxEpRet |            -299 |
|          MinEpRet |            -455 |
|  AverageTestEpRet |            -356 |
|      StdTestEpRet |           0.803 |
|      MaxTestEpRet |            -354 |
|      MinTestEpRet |            -357 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.337 |
|         StdQ1Vals |           0.771 |
|         MaxQ1Vals |            3.76 |
|         MinQ1Vals |           -2.97 |
|     AverageQ2Vals |          -0.338 |
|         StdQ2Vals |           0.772 |
|         MaxQ2Vals |            3.77 |
|         MinQ2Vals |           -3.05 |
|            LossPi |          -0.119 |
|       

t=14000, 63.90969395637512s
t=14200, 12.93337345123291s
t=14400, 12.385907649993896s
t=14600, 11.932081460952759s
t=14800, 12.627410650253296s
t=15000, 12.274166584014893s
t=15200, 12.053757429122925s
t=15400, 12.321087121963501s
t=15600, 13.195796489715576s
t=15800, 12.991249561309814s
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |            -257 |
|          StdEpRet |            49.4 |
|          MaxEpRet |            -208 |
|          MinEpRet |            -306 |
|  AverageTestEpRet |            -487 |
|      StdTestEpRet |             224 |
|      MaxTestEpRet |            -157 |
|      MinTestEpRet |            -792 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|     AverageQ1Vals |         -0.0828 |
|         StdQ1Vals |            3.54 |
|         MaxQ1Vals |            15.3 |
|         MinQ1Vals |           -16.9 |
|     AverageQ2Vals |          -

t=28000, 65.23550391197205s
t=28200, 13.152977705001831s
t=28400, 13.150822401046753s
t=28600, 12.365923881530762s
t=28800, 14.598948240280151s
t=29000, 12.785765171051025s
t=29200, 13.258252620697021s
t=29400, 12.673100233078003s
t=29600, 11.93786358833313s
t=29800, 12.036062002182007s
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |             256 |
|          StdEpRet |            9.55 |
|          MaxEpRet |             266 |
|          MinEpRet |             247 |
|  AverageTestEpRet |             303 |
|      StdTestEpRet |            37.1 |
|      MaxTestEpRet |             351 |
|      MinTestEpRet |             232 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |            3.64 |
|         StdQ1Vals |            11.8 |
|         MaxQ1Vals |            46.9 |
|         MinQ1Vals |           -28.6 |
|     AverageQ2Vals |           

t=42000, 65.1042869091034s
t=42200, 13.528808355331421s
t=42400, 12.473634004592896s
t=42600, 12.381879091262817s
t=42800, 12.33899474143982s
t=43000, 12.122893333435059s
t=43200, 12.206349849700928s
t=43400, 12.29909896850586s
t=43600, 12.35594892501831s
t=43800, 12.202359437942505s
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |             567 |
|          StdEpRet |             155 |
|          MaxEpRet |             722 |
|          MinEpRet |             412 |
|  AverageTestEpRet |             800 |
|      StdTestEpRet |            20.1 |
|      MaxTestEpRet |             834 |
|      MinTestEpRet |             770 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.4e+04 |
|     AverageQ1Vals |            20.1 |
|         StdQ1Vals |            19.6 |
|         MaxQ1Vals |             142 |
|         MinQ1Vals |           -31.7 |
|     AverageQ2Vals |            20

KeyboardInterrupt: 

In [402]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 10,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_NoShare_HistMemory_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_NoShare_HistMemory_HidStateLayer0_2L128_NoFreeze\lstm_td3_HalfCheetah_NoShare_HistMemory_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000019AA08B8E58>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_NoShare_HistMemory_HidStateLayer0_2L128_NoFreeze",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000019A9580F088>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_NoShare_HistMemory_HidStateLayer0_2L128_NoFreeze",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng\\go

  result = entry_point.load(False)


t=1200
t=1400
t=1600
t=1800
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -385 |
|          StdEpRet |            24.2 |
|          MaxEpRet |            -361 |
|          MinEpRet |            -410 |
|  AverageTestEpRet |            -738 |
|      StdTestEpRet |            9.13 |
|      MaxTestEpRet |            -726 |
|      MinTestEpRet |            -757 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.415 |
|         StdQ1Vals |           0.817 |
|         MaxQ1Vals |             3.7 |
|         MinQ1Vals |           -3.75 |
|     AverageQ2Vals |          -0.416 |
|         StdQ2Vals |           0.818 |
|         MaxQ2Vals |            3.67 |
|         MinQ2Vals |            -3.8 |
|            LossPi |          -0.136 |
|             LossQ |          0.0723 |
|              Time |            95.9 |
------------

t=16000
t=16200
t=16400
t=16600
t=16800
t=17000
t=17200
t=17400
t=17600
t=17800
---------------------------------------
|             Epoch |               9 |
|      AverageEpRet |             777 |
|          StdEpRet |           0.381 |
|          MaxEpRet |             777 |
|          MinEpRet |             777 |
|  AverageTestEpRet |             590 |
|      StdTestEpRet |             154 |
|      MaxTestEpRet |             810 |
|      MinTestEpRet |             344 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.8e+04 |
|     AverageQ1Vals |            4.96 |
|         StdQ1Vals |            11.8 |
|         MaxQ1Vals |            39.8 |
|         MinQ1Vals |           -21.7 |
|     AverageQ2Vals |            4.96 |
|         StdQ2Vals |            11.8 |
|         MaxQ2Vals |            40.4 |
|         MinQ2Vals |           -21.8 |
|            LossPi |           -5.83 |
|             LossQ |            1.63 |


t=32000
t=32200
t=32400
t=32600
t=32800
t=33000
t=33200
t=33400
t=33600
t=33800
---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |             865 |
|          StdEpRet |            53.4 |
|          MaxEpRet |             918 |
|          MinEpRet |             812 |
|  AverageTestEpRet |             575 |
|      StdTestEpRet |             389 |
|      MaxTestEpRet |             853 |
|      MinTestEpRet |            -351 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.4e+04 |
|     AverageQ1Vals |            27.5 |
|         StdQ1Vals |            21.7 |
|         MaxQ1Vals |            66.2 |
|         MinQ1Vals |           -23.2 |
|     AverageQ2Vals |            27.5 |
|         StdQ2Vals |            21.7 |
|         MaxQ2Vals |            64.8 |
|         MinQ2Vals |           -23.4 |
|            LossPi |           -28.6 |
|             LossQ |               7 |


t=48000
t=48200
t=48400
t=48600
t=48800
t=49000
t=49200
t=49400
t=49600
t=49800
---------------------------------------
|             Epoch |              25 |
|      AverageEpRet |            64.7 |
|          StdEpRet |             514 |
|          MaxEpRet |             579 |
|          MinEpRet |            -449 |
|  AverageTestEpRet |            -311 |
|      StdTestEpRet |            97.6 |
|      MaxTestEpRet |            -177 |
|      MinTestEpRet |            -454 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           5e+04 |
|     AverageQ1Vals |            47.4 |
|         StdQ1Vals |            25.7 |
|         MaxQ1Vals |             141 |
|         MinQ1Vals |           -20.8 |
|     AverageQ2Vals |            47.4 |
|         StdQ2Vals |            25.7 |
|         MaxQ2Vals |             141 |
|         MinQ2Vals |           -20.2 |
|            LossPi |           -48.8 |
|             LossQ |            17.9 |


t=64000
t=64200
t=64400
t=64600
t=64800
t=65000
t=65200
t=65400
t=65600
t=65800
---------------------------------------
|             Epoch |              33 |
|      AverageEpRet |             843 |
|          StdEpRet |            12.8 |
|          MaxEpRet |             856 |
|          MinEpRet |             830 |
|  AverageTestEpRet |             998 |
|      StdTestEpRet |            47.6 |
|      MaxTestEpRet |        1.07e+03 |
|      MinTestEpRet |             922 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.6e+04 |
|     AverageQ1Vals |            51.9 |
|         StdQ1Vals |            34.4 |
|         MaxQ1Vals |             145 |
|         MinQ1Vals |           -43.2 |
|     AverageQ2Vals |            51.9 |
|         StdQ2Vals |            34.4 |
|         MaxQ2Vals |             145 |
|         MinQ2Vals |           -42.3 |
|            LossPi |           -53.2 |
|             LossQ |            16.6 |


t=80000
t=80200
t=80400
t=80600
t=80800
t=81000
t=81200
t=81400
t=81600
t=81800
---------------------------------------
|             Epoch |              41 |
|      AverageEpRet |             884 |
|          StdEpRet |            33.8 |
|          MaxEpRet |             918 |
|          MinEpRet |             850 |
|  AverageTestEpRet |        1.01e+03 |
|      StdTestEpRet |            36.4 |
|      MaxTestEpRet |        1.06e+03 |
|      MinTestEpRet |             941 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         8.2e+04 |
|     AverageQ1Vals |            59.1 |
|         StdQ1Vals |            37.5 |
|         MaxQ1Vals |             121 |
|         MinQ1Vals |           -62.5 |
|     AverageQ2Vals |            59.1 |
|         StdQ2Vals |            37.5 |
|         MaxQ2Vals |             122 |
|         MinQ2Vals |             -62 |
|            LossPi |           -60.2 |
|             LossQ |            11.3 |


KeyboardInterrupt: 

In [384]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 0,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_NoShare_NoHistMemory_HidStateLayer0_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_NoShare_NoHistMemory_HidStateLayer0_2L128_NoFreeze\lstm_td3_HalfCheetah_NoShare_NoHistMemory_HidStateLayer0_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000019A9526DC18>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_NoShare_NoHistMemory_HidStateLayer0_2L128_NoFreeze",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000019A9527DF88>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_NoShare_NoHistMemory_HidStateLayer0_2L128_NoFreeze",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\ling

  result = entry_point.load(False)


t=1200
t=1400
t=1600
t=1800
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -426 |
|          StdEpRet |            94.4 |
|          MaxEpRet |            -331 |
|          MinEpRet |            -520 |
|  AverageTestEpRet |            -570 |
|      StdTestEpRet |            2.45 |
|      MaxTestEpRet |            -566 |
|      MinTestEpRet |            -574 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.332 |
|         StdQ1Vals |           0.788 |
|         MaxQ1Vals |            4.17 |
|         MinQ1Vals |           -2.73 |
|     AverageQ2Vals |          -0.332 |
|         StdQ2Vals |           0.791 |
|         MaxQ2Vals |            3.99 |
|         MinQ2Vals |           -2.75 |
|            LossPi |          -0.297 |
|             LossQ |           0.145 |
|              Time |            63.5 |
------------

t=16000
t=16200
t=16400
t=16600
t=16800
t=17000
t=17200
t=17400
t=17600
t=17800
---------------------------------------
|             Epoch |               9 |
|      AverageEpRet |             475 |
|          StdEpRet |            39.7 |
|          MaxEpRet |             515 |
|          MinEpRet |             436 |
|  AverageTestEpRet |             541 |
|      StdTestEpRet |             136 |
|      MaxTestEpRet |             653 |
|      MinTestEpRet |             267 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.8e+04 |
|     AverageQ1Vals |           -1.87 |
|         StdQ1Vals |            3.56 |
|         MaxQ1Vals |            16.6 |
|         MinQ1Vals |           -16.8 |
|     AverageQ2Vals |           -1.87 |
|         StdQ2Vals |            3.57 |
|         MaxQ2Vals |            18.9 |
|         MinQ2Vals |           -17.5 |
|            LossPi |            1.09 |
|             LossQ |            1.48 |


t=32000
t=32200
t=32400
t=32600
t=32800
t=33000
t=33200
t=33400
t=33600
t=33800
---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |             802 |
|          StdEpRet |            41.6 |
|          MaxEpRet |             844 |
|          MinEpRet |             761 |
|  AverageTestEpRet |             751 |
|      StdTestEpRet |            17.1 |
|      MaxTestEpRet |             783 |
|      MinTestEpRet |             729 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.4e+04 |
|     AverageQ1Vals |            14.6 |
|         StdQ1Vals |            7.34 |
|         MaxQ1Vals |            26.8 |
|         MinQ1Vals |           -14.8 |
|     AverageQ2Vals |            14.6 |
|         StdQ2Vals |            7.34 |
|         MaxQ2Vals |            26.7 |
|         MinQ2Vals |           -16.5 |
|            LossPi |           -15.6 |
|             LossQ |            1.77 |


t=48000
t=48200
t=48400
t=48600
t=48800
t=49000
t=49200
t=49400
t=49600
t=49800
---------------------------------------
|             Epoch |              25 |
|      AverageEpRet |             841 |
|          StdEpRet |            25.2 |
|          MaxEpRet |             866 |
|          MinEpRet |             815 |
|  AverageTestEpRet |             970 |
|      StdTestEpRet |            40.2 |
|      MaxTestEpRet |        1.02e+03 |
|      MinTestEpRet |             906 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           5e+04 |
|     AverageQ1Vals |            30.7 |
|         StdQ1Vals |            6.48 |
|         MaxQ1Vals |            43.8 |
|         MinQ1Vals |          0.0232 |
|     AverageQ2Vals |            30.7 |
|         StdQ2Vals |            6.48 |
|         MaxQ2Vals |            43.3 |
|         MinQ2Vals |           -1.07 |
|            LossPi |           -31.6 |
|             LossQ |            2.25 |


KeyboardInterrupt: 

In [382]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 0,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_Share_NoHistMemory_HidStateLayer64_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Share_NoHistMemory_HidStateLayer64_2L128_NoFreeze\lstm_td3_HalfCheetah_Share_NoHistMemory_HidStateLayer64_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000019A9E8C2B88>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Share_NoHistMemory_HidStateLayer64_2L128_NoFreeze",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000019A95971B08>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Share_NoHistMemory_HidStateLayer64_2L128_NoFreeze",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng

  result = entry_point.load(False)


t=1200
t=1400
t=1600
t=1800
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -295 |
|          StdEpRet |              47 |
|          MaxEpRet |            -248 |
|          MinEpRet |            -342 |
|  AverageTestEpRet |            -130 |
|      StdTestEpRet |            4.81 |
|      MaxTestEpRet |            -123 |
|      MinTestEpRet |            -136 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.765 |
|         StdQ1Vals |           0.722 |
|         MaxQ1Vals |            1.76 |
|         MinQ1Vals |           -5.32 |
|     AverageQ2Vals |          -0.763 |
|         StdQ2Vals |           0.724 |
|         MaxQ2Vals |            1.79 |
|         MinQ2Vals |           -5.36 |
|            LossPi |           0.262 |
|             LossQ |           0.181 |
|              Time |            78.5 |
------------

t=16000
t=16200
t=16400
t=16600
t=16800
t=17000
t=17200
t=17400
t=17600
t=17800
---------------------------------------
|             Epoch |               9 |
|      AverageEpRet |             171 |
|          StdEpRet |            1.83 |
|          MaxEpRet |             172 |
|          MinEpRet |             169 |
|  AverageTestEpRet |             157 |
|      StdTestEpRet |            99.2 |
|      MaxTestEpRet |             333 |
|      MinTestEpRet |            10.9 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.8e+04 |
|     AverageQ1Vals |          -0.195 |
|         StdQ1Vals |            4.46 |
|         MaxQ1Vals |            16.5 |
|         MinQ1Vals |           -8.74 |
|     AverageQ2Vals |          -0.196 |
|         StdQ2Vals |            4.46 |
|         MaxQ2Vals |            15.7 |
|         MinQ2Vals |           -8.36 |
|            LossPi |          -0.246 |
|             LossQ |           0.795 |


t=32000
t=32200
t=32400
t=32600
t=32800
t=33000
t=33200
t=33400
t=33600
t=33800
---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |        1.07e+03 |
|          StdEpRet |            53.1 |
|          MaxEpRet |        1.13e+03 |
|          MinEpRet |        1.02e+03 |
|  AverageTestEpRet |        1.19e+03 |
|      StdTestEpRet |            27.6 |
|      MaxTestEpRet |        1.23e+03 |
|      MinTestEpRet |        1.15e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.4e+04 |
|     AverageQ1Vals |            18.3 |
|         StdQ1Vals |              15 |
|         MaxQ1Vals |            50.6 |
|         MinQ1Vals |           -3.54 |
|     AverageQ2Vals |            18.3 |
|         StdQ2Vals |              15 |
|         MaxQ2Vals |            50.1 |
|         MinQ2Vals |           -2.66 |
|            LossPi |           -18.8 |
|             LossQ |            3.02 |


t=48000
t=48200
t=48400
t=48600
t=48800
t=49000
t=49200
t=49400
t=49600
t=49800
---------------------------------------
|             Epoch |              25 |
|      AverageEpRet |        1.25e+03 |
|          StdEpRet |            2.89 |
|          MaxEpRet |        1.26e+03 |
|          MinEpRet |        1.25e+03 |
|  AverageTestEpRet |        1.02e+03 |
|      StdTestEpRet |            55.9 |
|      MaxTestEpRet |        1.14e+03 |
|      MinTestEpRet |             961 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           5e+04 |
|     AverageQ1Vals |            38.3 |
|         StdQ1Vals |            22.5 |
|         MaxQ1Vals |            77.1 |
|         MinQ1Vals |            1.36 |
|     AverageQ2Vals |            38.3 |
|         StdQ2Vals |            22.5 |
|         MaxQ2Vals |            77.6 |
|         MinQ2Vals |           -7.18 |
|            LossPi |           -38.9 |
|             LossQ |            6.44 |


t=64000
t=64200
t=64400
t=64600
t=64800
t=65000
t=65200
t=65400
t=65600
t=65800
---------------------------------------
|             Epoch |              33 |
|      AverageEpRet |         1.1e+03 |
|          StdEpRet |              63 |
|          MaxEpRet |        1.17e+03 |
|          MinEpRet |        1.04e+03 |
|  AverageTestEpRet |        1.15e+03 |
|      StdTestEpRet |            77.9 |
|      MaxTestEpRet |        1.29e+03 |
|      MinTestEpRet |             995 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.6e+04 |
|     AverageQ1Vals |            50.9 |
|         StdQ1Vals |            24.9 |
|         MaxQ1Vals |            86.6 |
|         MinQ1Vals |          0.0755 |
|     AverageQ2Vals |            50.9 |
|         StdQ2Vals |            24.9 |
|         MaxQ2Vals |            86.2 |
|         MinQ2Vals |            2.37 |
|            LossPi |           -51.3 |
|             LossQ |            6.05 |


KeyboardInterrupt: 

In [379]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 0,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_NotShare_NoHistMemory_HidStateLayer64_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_NotShare_NoHistMemory_HidStateLayer64_2L128_NoFreeze\lstm_td3_HalfCheetah_NotShare_NoHistMemory_HidStateLayer64_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000019A9ED3C318>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_NotShare_NoHistMemory_HidStateLayer64_2L128_NoFreeze",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000019AB5EA97C8>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_NotShare_NoHistMemory_HidStateLayer64_2L128_NoFreeze",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\use

t=10000
t=10200
t=10400
t=10600
t=10800
t=11000
t=11200
t=11400
t=11600
t=11800
---------------------------------------
|             Epoch |               6 |
|      AverageEpRet |            74.5 |
|          StdEpRet |            40.5 |
|          MaxEpRet |             115 |
|          MinEpRet |              34 |
|  AverageTestEpRet |            67.4 |
|      StdTestEpRet |            51.3 |
|      MaxTestEpRet |             196 |
|      MinTestEpRet |            9.97 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.2e+04 |
|     AverageQ1Vals |           -1.34 |
|         StdQ1Vals |            3.02 |
|         MaxQ1Vals |            14.3 |
|         MinQ1Vals |           -8.44 |
|     AverageQ2Vals |           -1.34 |
|         StdQ2Vals |            3.02 |
|         MaxQ2Vals |            14.3 |
|         MinQ2Vals |           -8.93 |
|            LossPi |           0.845 |
|             LossQ |           0.536 |


t=26000
t=26200
t=26400
t=26600
t=26800
t=27000
t=27200
t=27400
t=27600
t=27800
---------------------------------------
|             Epoch |              14 |
|      AverageEpRet |             425 |
|          StdEpRet |            47.4 |
|          MaxEpRet |             472 |
|          MinEpRet |             378 |
|  AverageTestEpRet |             501 |
|      StdTestEpRet |              38 |
|      MaxTestEpRet |             564 |
|      MinTestEpRet |             427 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         2.8e+04 |
|     AverageQ1Vals |            5.46 |
|         StdQ1Vals |            3.21 |
|         MaxQ1Vals |            16.9 |
|         MinQ1Vals |           -17.5 |
|     AverageQ2Vals |            5.46 |
|         StdQ2Vals |            3.21 |
|         MaxQ2Vals |            17.9 |
|         MinQ2Vals |           -17.7 |
|            LossPi |           -6.14 |
|             LossQ |            1.53 |


t=42000
t=42200
t=42400
t=42600
t=42800
t=43000
t=43200
t=43400
t=43600
t=43800
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |             715 |
|          StdEpRet |           0.464 |
|          MaxEpRet |             715 |
|          MinEpRet |             714 |
|  AverageTestEpRet |             677 |
|      StdTestEpRet |            37.8 |
|      MaxTestEpRet |             764 |
|      MinTestEpRet |             622 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.4e+04 |
|     AverageQ1Vals |            17.7 |
|         StdQ1Vals |            5.38 |
|         MaxQ1Vals |            30.7 |
|         MinQ1Vals |             -21 |
|     AverageQ2Vals |            17.7 |
|         StdQ2Vals |            5.38 |
|         MaxQ2Vals |            31.6 |
|         MinQ2Vals |           -20.1 |
|            LossPi |           -18.4 |
|             LossQ |            2.36 |


t=58000
t=58200
t=58400
t=58600
t=58800
t=59000
t=59200
t=59400
t=59600
t=59800
---------------------------------------
|             Epoch |              30 |
|      AverageEpRet |             809 |
|          StdEpRet |              65 |
|          MaxEpRet |             874 |
|          MinEpRet |             744 |
|  AverageTestEpRet |             878 |
|      StdTestEpRet |            54.5 |
|      MaxTestEpRet |           1e+03 |
|      MinTestEpRet |             789 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           6e+04 |
|     AverageQ1Vals |              30 |
|         StdQ1Vals |            6.44 |
|         MaxQ1Vals |            43.9 |
|         MinQ1Vals |           -21.7 |
|     AverageQ2Vals |              30 |
|         StdQ2Vals |            6.44 |
|         MaxQ2Vals |            43.3 |
|         MinQ2Vals |           -18.4 |
|            LossPi |           -30.8 |
|             LossQ |            2.75 |


t=74000
t=74200
t=74400
t=74600
t=74800
t=75000
t=75200
t=75400
t=75600
t=75800
---------------------------------------
|             Epoch |              38 |
|      AverageEpRet |        1.03e+03 |
|          StdEpRet |            3.09 |
|          MaxEpRet |        1.03e+03 |
|          MinEpRet |        1.02e+03 |
|  AverageTestEpRet |        1.03e+03 |
|      StdTestEpRet |            14.2 |
|      MaxTestEpRet |        1.05e+03 |
|      MinTestEpRet |        1.01e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         7.6e+04 |
|     AverageQ1Vals |            42.6 |
|         StdQ1Vals |             7.3 |
|         MaxQ1Vals |            56.2 |
|         MinQ1Vals |           -4.42 |
|     AverageQ2Vals |            42.6 |
|         StdQ2Vals |            7.29 |
|         MaxQ2Vals |            56.2 |
|         MinQ2Vals |           -4.77 |
|            LossPi |           -43.3 |
|             LossQ |            2.99 |


KeyboardInterrupt: 

In [366]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 0,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer_2L128_NoFreeze_MLP'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer_2L128_NoFreeze_MLP\lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer_2L128_NoFreeze_MLP_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000019A95778DC8>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer_2L128_NoFreeze_MLP",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000019AD2010748>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer_2L128_NoFreeze_MLP",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "outpu

t=10000
t=10200
t=10400
t=10600
t=10800
t=11000
t=11200
t=11400
t=11600
t=11800
---------------------------------------
|             Epoch |               6 |
|      AverageEpRet |            -455 |
|          StdEpRet |            16.2 |
|          MaxEpRet |            -438 |
|          MinEpRet |            -471 |
|  AverageTestEpRet |            -309 |
|      StdTestEpRet |            31.7 |
|      MaxTestEpRet |            -259 |
|      MinTestEpRet |            -379 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.2e+04 |
|     AverageQ1Vals |           -4.96 |
|         StdQ1Vals |            6.76 |
|         MaxQ1Vals |            16.7 |
|         MinQ1Vals |           -26.1 |
|     AverageQ2Vals |           -4.96 |
|         StdQ2Vals |            6.76 |
|         MaxQ2Vals |              16 |
|         MinQ2Vals |           -25.1 |
|            LossPi |            4.29 |
|             LossQ |           0.949 |


t=26000
t=26200
t=26400
t=26600
t=26800
t=27000
t=27200
t=27400
t=27600
t=27800
---------------------------------------
|             Epoch |              14 |
|      AverageEpRet |        1.29e+03 |
|          StdEpRet |            21.3 |
|          MaxEpRet |        1.32e+03 |
|          MinEpRet |        1.27e+03 |
|  AverageTestEpRet |        1.64e+03 |
|      StdTestEpRet |             149 |
|      MaxTestEpRet |        1.78e+03 |
|      MinTestEpRet |         1.3e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         2.8e+04 |
|     AverageQ1Vals |           -2.32 |
|         StdQ1Vals |            16.6 |
|         MaxQ1Vals |            57.5 |
|         MinQ1Vals |             -31 |
|     AverageQ2Vals |           -2.32 |
|         StdQ2Vals |            16.6 |
|         MaxQ2Vals |            52.8 |
|         MinQ2Vals |           -30.7 |
|            LossPi |            1.42 |
|             LossQ |            3.58 |


t=42000
t=42200
t=42400
t=42600
t=42800
t=43000
t=43200
t=43400
t=43600
t=43800
---------------------------------------
|             Epoch |              22 |
|      AverageEpRet |        2.49e+03 |
|          StdEpRet |             938 |
|          MaxEpRet |        3.43e+03 |
|          MinEpRet |        1.55e+03 |
|  AverageTestEpRet |        3.66e+03 |
|      StdTestEpRet |             437 |
|      MaxTestEpRet |        3.95e+03 |
|      MinTestEpRet |        2.38e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.4e+04 |
|     AverageQ1Vals |            37.2 |
|         StdQ1Vals |            47.4 |
|         MaxQ1Vals |             145 |
|         MinQ1Vals |           -33.4 |
|     AverageQ2Vals |            37.2 |
|         StdQ2Vals |            47.4 |
|         MaxQ2Vals |             145 |
|         MinQ2Vals |           -33.6 |
|            LossPi |           -39.1 |
|             LossQ |            13.5 |


t=58000
t=58200
t=58400
t=58600
t=58800
t=59000
t=59200
t=59400
t=59600
t=59800
---------------------------------------
|             Epoch |              30 |
|      AverageEpRet |        4.06e+03 |
|          StdEpRet |             117 |
|          MaxEpRet |        4.18e+03 |
|          MinEpRet |        3.94e+03 |
|  AverageTestEpRet |        4.26e+03 |
|      StdTestEpRet |             118 |
|      MaxTestEpRet |        4.37e+03 |
|      MinTestEpRet |        3.94e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           6e+04 |
|     AverageQ1Vals |             102 |
|         StdQ1Vals |            80.7 |
|         MaxQ1Vals |             225 |
|         MinQ1Vals |           -49.7 |
|     AverageQ2Vals |             102 |
|         StdQ2Vals |            80.7 |
|         MaxQ2Vals |             223 |
|         MinQ2Vals |           -49.3 |
|            LossPi |            -105 |
|             LossQ |            21.1 |


t=74000
t=74200
t=74400
t=74600
t=74800
t=75000
t=75200
t=75400
t=75600
t=75800
---------------------------------------
|             Epoch |              38 |
|      AverageEpRet |        4.18e+03 |
|          StdEpRet |            14.4 |
|          MaxEpRet |        4.19e+03 |
|          MinEpRet |        4.16e+03 |
|  AverageTestEpRet |        4.47e+03 |
|      StdTestEpRet |              32 |
|      MaxTestEpRet |        4.53e+03 |
|      MinTestEpRet |        4.41e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         7.6e+04 |
|     AverageQ1Vals |             163 |
|         StdQ1Vals |            96.9 |
|         MaxQ1Vals |             271 |
|         MinQ1Vals |             -51 |
|     AverageQ2Vals |             163 |
|         StdQ2Vals |            96.9 |
|         MaxQ2Vals |             270 |
|         MinQ2Vals |           -50.6 |
|            LossPi |            -165 |
|             LossQ |              19 |


t=90000


KeyboardInterrupt: 

In [357]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 0,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer_2L128_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer_2L128_NoFreeze\lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer_2L128_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000019AD1F71288>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer_2L128_NoFreeze",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000019AF74B9448>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer_2L128_NoFreeze",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\use

  result = entry_point.load(False)


t=1200
t=1400
t=1600
t=1800
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -399 |
|          StdEpRet |            67.3 |
|          MaxEpRet |            -332 |
|          MinEpRet |            -467 |
|  AverageTestEpRet |            -415 |
|      StdTestEpRet |              20 |
|      MaxTestEpRet |            -391 |
|      MinTestEpRet |            -451 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.314 |
|         StdQ1Vals |           0.736 |
|         MaxQ1Vals |            4.71 |
|         MinQ1Vals |           -3.33 |
|     AverageQ2Vals |          -0.315 |
|         StdQ2Vals |           0.739 |
|         MaxQ2Vals |            4.45 |
|         MinQ2Vals |           -3.51 |
|            LossPi |          -0.241 |
|             LossQ |           0.149 |
|              Time |            82.5 |
------------

t=16000
t=16200
t=16400
t=16600
t=16800
t=17000
t=17200
t=17400
t=17600
t=17800
---------------------------------------
|             Epoch |               9 |
|      AverageEpRet |             742 |
|          StdEpRet |            11.4 |
|          MaxEpRet |             753 |
|          MinEpRet |             731 |
|  AverageTestEpRet |             682 |
|      StdTestEpRet |             144 |
|      MaxTestEpRet |             785 |
|      MinTestEpRet |             275 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.8e+04 |
|     AverageQ1Vals |             3.9 |
|         StdQ1Vals |            5.64 |
|         MaxQ1Vals |            20.1 |
|         MinQ1Vals |           -33.6 |
|     AverageQ2Vals |             3.9 |
|         StdQ2Vals |            5.64 |
|         MaxQ2Vals |            21.9 |
|         MinQ2Vals |             -34 |
|            LossPi |           -4.71 |
|             LossQ |            1.68 |


KeyboardInterrupt: 

In [354]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 0,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer_2L128'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer_2L128\lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer_2L128_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000019AB9E3E9D8>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer_2L128",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000019AA073AF48>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer_2L128",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng\\google drive\\git_repo

  result = entry_point.load(False)


t=1200
t=1400
t=1600
t=1800
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -444 |
|          StdEpRet |            43.1 |
|          MaxEpRet |            -401 |
|          MinEpRet |            -488 |
|  AverageTestEpRet |            -492 |
|      StdTestEpRet |            1.84 |
|      MaxTestEpRet |            -489 |
|      MinTestEpRet |            -496 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.771 |
|         StdQ1Vals |           0.671 |
|         MaxQ1Vals |            3.08 |
|         MinQ1Vals |            -3.8 |
|     AverageQ2Vals |          -0.772 |
|         StdQ2Vals |           0.675 |
|         MaxQ2Vals |            3.09 |
|         MinQ2Vals |           -3.93 |
|            LossPi |           0.141 |
|             LossQ |           0.287 |
|              Time |            75.9 |
------------

t=16000
t=16200
t=16400
t=16600
t=16800
t=17000
t=17200
t=17400
t=17600
t=17800
---------------------------------------
|             Epoch |               9 |
|      AverageEpRet |             443 |
|          StdEpRet |              28 |
|          MaxEpRet |             471 |
|          MinEpRet |             415 |
|  AverageTestEpRet |             451 |
|      StdTestEpRet |            18.7 |
|      MaxTestEpRet |             483 |
|      MinTestEpRet |             421 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.8e+04 |
|     AverageQ1Vals |            3.03 |
|         StdQ1Vals |            3.04 |
|         MaxQ1Vals |            13.2 |
|         MinQ1Vals |           -14.1 |
|     AverageQ2Vals |            3.03 |
|         StdQ2Vals |            3.04 |
|         MaxQ2Vals |            12.7 |
|         MinQ2Vals |           -13.7 |
|            LossPi |           -3.59 |
|             LossQ |            1.01 |


t=32000
t=32200
t=32400
t=32600
t=32800
t=33000
t=33200
t=33400
t=33600
t=33800
---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |             489 |
|          StdEpRet |             106 |
|          MaxEpRet |             595 |
|          MinEpRet |             384 |
|  AverageTestEpRet |             547 |
|      StdTestEpRet |            43.7 |
|      MaxTestEpRet |             606 |
|      MinTestEpRet |             451 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.4e+04 |
|     AverageQ1Vals |            12.6 |
|         StdQ1Vals |            4.46 |
|         MaxQ1Vals |            21.1 |
|         MinQ1Vals |           -13.5 |
|     AverageQ2Vals |            12.6 |
|         StdQ2Vals |            4.45 |
|         MaxQ2Vals |              21 |
|         MinQ2Vals |             -13 |
|            LossPi |           -13.1 |
|             LossQ |            1.49 |


KeyboardInterrupt: 

In [342]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 0,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer\lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000019AA51341F8>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000019A95258AC8>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_NotShare_NoHistMemory_NoHidStateLayer",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng\\google drive\\git_repos\\spinningup-new\\data\

t=10000
t=10200
t=10400
t=10600
t=10800
t=11000
t=11200
t=11400
t=11600
t=11800
---------------------------------------
|             Epoch |               6 |
|      AverageEpRet |            -457 |
|          StdEpRet |            93.9 |
|          MaxEpRet |            -363 |
|          MinEpRet |            -551 |
|  AverageTestEpRet |            -301 |
|      StdTestEpRet |            38.1 |
|      MaxTestEpRet |            -190 |
|      MinTestEpRet |            -332 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.2e+04 |
|     AverageQ1Vals |           -1.79 |
|         StdQ1Vals |            5.32 |
|         MaxQ1Vals |            18.7 |
|         MinQ1Vals |           -13.4 |
|     AverageQ2Vals |           -1.79 |
|         StdQ2Vals |            5.32 |
|         MaxQ2Vals |            19.1 |
|         MinQ2Vals |           -12.3 |
|            LossPi |            1.28 |
|             LossQ |            1.16 |


t=26000
t=26200
t=26400
t=26600
t=26800
t=27000
t=27200
t=27400
t=27600
t=27800
---------------------------------------
|             Epoch |              14 |
|      AverageEpRet |        2.15e+03 |
|          StdEpRet |             161 |
|          MaxEpRet |        2.31e+03 |
|          MinEpRet |        1.99e+03 |
|  AverageTestEpRet |             192 |
|      StdTestEpRet |             396 |
|      MaxTestEpRet |        1.21e+03 |
|      MinTestEpRet |            -126 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         2.8e+04 |
|     AverageQ1Vals |            5.46 |
|         StdQ1Vals |              22 |
|         MaxQ1Vals |            83.7 |
|         MinQ1Vals |             -22 |
|     AverageQ2Vals |            5.46 |
|         StdQ2Vals |            21.9 |
|         MaxQ2Vals |            83.8 |
|         MinQ2Vals |           -19.8 |
|            LossPi |           -6.36 |
|             LossQ |            5.18 |


KeyboardInterrupt: 

In [277]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'max_hist_len': 0,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_NotShare'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

td3(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     max_hist_len=args['max_hist_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_NotShare\lstm_td3_HalfCheetah_NotShare_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000019A956E6708>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_NotShare",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000019A95293088>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_NotShare",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng\\google drive\\git_repos\\spinningup-new\\data\\lstm_td3_HalfCheetah_NotShare\\lstm_td3_HalfCheetah_NotShare_s0",
            "output_file":	{
                "<_i

  result = entry_point.load(False)


t=1200
t=1400
t=1600
t=1800
---------------------------------------
|             Epoch |               1 |
|      AverageEpRet |            -354 |
|          StdEpRet |             114 |
|          MaxEpRet |            -240 |
|          MinEpRet |            -467 |
|  AverageTestEpRet |            -556 |
|      StdTestEpRet |             1.3 |
|      MaxTestEpRet |            -554 |
|      MinTestEpRet |            -559 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           2e+03 |
|     AverageQ1Vals |          -0.429 |
|         StdQ1Vals |           0.667 |
|         MaxQ1Vals |            3.25 |
|         MinQ1Vals |           -3.73 |
|     AverageQ2Vals |          -0.429 |
|         StdQ2Vals |           0.667 |
|         MaxQ2Vals |            3.38 |
|         MinQ2Vals |           -3.82 |
|            LossPi |          -0.108 |
|             LossQ |           0.146 |
|              Time |            87.4 |
------------

t=16000
t=16200
t=16400
t=16600
t=16800
t=17000
t=17200
t=17400
t=17600
t=17800
---------------------------------------
|             Epoch |               9 |
|      AverageEpRet |             605 |
|          StdEpRet |            84.8 |
|          MaxEpRet |             690 |
|          MinEpRet |             520 |
|  AverageTestEpRet |             693 |
|      StdTestEpRet |            78.5 |
|      MaxTestEpRet |             787 |
|      MinTestEpRet |             543 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.8e+04 |
|     AverageQ1Vals |            12.9 |
|         StdQ1Vals |            8.15 |
|         MaxQ1Vals |            32.4 |
|         MinQ1Vals |             -22 |
|     AverageQ2Vals |            12.9 |
|         StdQ2Vals |            8.16 |
|         MaxQ2Vals |            32.2 |
|         MinQ2Vals |           -23.5 |
|            LossPi |           -13.5 |
|             LossQ |            3.89 |


KeyboardInterrupt: 

In [None]:
list((1,))