In [1]:
from copy import deepcopy
import numpy as np
import torch
from torch.optim import Adam
import gym
import time
import spinup.algos.pytorch.lstm_ddpg.core as core
from spinup.utils.logx import EpochLogger
import itertools

In [37]:
class LSTMReplayBuffer:
    """
    A simple FIFO experience replay buffer for agents.
    """

    def __init__(self, obs_dim, act_dim, max_size):
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.max_size = max_size
        self.obs_buf = np.zeros(core.combined_shape(max_size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros(core.combined_shape(max_size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(core.combined_shape(max_size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(max_size, dtype=np.float32)
        self.done_buf = np.zeros(max_size, dtype=np.float32)
        self.ptr, self.size = 0, 0

    def store(self, obs, act, rew, next_obs, done):
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.obs2_buf[self.ptr] = list(next_obs)
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr+1) % self.max_size
        self.size = min(self.size+1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs_buf[idxs],
                     obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs],
                     rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs])
        return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in batch.items()}
    
    def sample_segment_batch(self, batch_size=32, segment_len=10):
        # Because we need segment_len backward experiences so sampling starts from segment_len to end of buffer.
        idxs = np.random.randint(segment_len, self.size, size=batch_size)
        seg_batch_obs = np.zeros([batch_size, segment_len, self.obs_dim])
        seg_batch_act = np.zeros([batch_size, segment_len, self.act_dim])
        seg_batch_obs2 = np.zeros([batch_size, segment_len, self.obs_dim])
        seg_batch_rew = np.zeros([batch_size, segment_len])
        seg_batch_done = np.zeros([batch_size, segment_len])
        seg_batch_len = segment_len * np.ones(batch_size)
        seg_batch_mask = np.ones([batch_size, segment_len]) # mask to ignore everything beyond segment
        # Look backward segment experiences
        for seg_i in range(segment_len):
            seg_batch_obs[:, -1-seg_i, :] = self.obs_buf[idxs-seg_i, :]
            seg_batch_act[:, -1-seg_i, :] = self.act_buf[idxs-seg_i, :]
            seg_batch_obs2[:, -1-seg_i, :] = self.obs2_buf[idxs-seg_i, :]
            seg_batch_rew[:, -1-seg_i] = self.rew_buf[idxs-seg_i]
            seg_batch_done[:, -1-seg_i] = self.done_buf[idxs-seg_i]
        
        # If there is done in the backward experiences, only consider the experiences after the last done.
        for batch_i in range(batch_size):
            done_idxs_exclude_last_exp = np.where(seg_batch_done[batch_i][:-1] == 1)  # Exclude last experience
            # If exist done
            if done_idxs_exclude_last_exp[0].size != 0:
                largest_done_id = done_idxs_exclude_last_exp[0][-1]
                seg_batch_len[batch_i] = segment_len - (largest_done_id+1)
                
                # Only keep experiences after the last done
                obs_keep_part = np.copy(seg_batch_obs[batch_i, largest_done_id+1:, :])
                act_keep_part = np.copy(seg_batch_act[batch_i, largest_done_id+1:, :])
                obs2_keep_part = np.copy(seg_batch_obs2[batch_i, largest_done_id+1:, :])
                rew_keep_part = np.copy(seg_batch_rew[batch_i, largest_done_id+1:])
                done_keep_part = np.copy(seg_batch_done[batch_i, largest_done_id+1:])

                # Set to 0 to make sure all experiences are at the beginning
                seg_batch_obs[batch_i] = np.zeros([segment_len, self.obs_dim])
                seg_batch_act[batch_i] = np.zeros([segment_len, self.act_dim])
                seg_batch_obs2[batch_i] = np.zeros([segment_len, self.obs_dim])
                seg_batch_rew[batch_i] = np.zeros([segment_len])
                seg_batch_done[batch_i] = np.zeros([segment_len])

                # Move kept experiences to the start of the segment
                seg_batch_obs[batch_i, :segment_len-(largest_done_id+1), :] = obs_keep_part
                seg_batch_act[batch_i, :segment_len-(largest_done_id+1), :] = act_keep_part
                seg_batch_obs2[batch_i, :segment_len-(largest_done_id+1), :] = obs2_keep_part
                seg_batch_rew[batch_i, :segment_len-(largest_done_id+1)] = rew_keep_part
                seg_batch_done[batch_i, :segment_len-(largest_done_id+1)] = done_keep_part
                
                seg_batch_mask[batch_i, segment_len-(largest_done_id+1):] = 0
        
        batch = dict(seg_obs=seg_batch_obs, 
                     seg_act=seg_batch_act, 
                     seg_obs2=seg_batch_obs2, 
                     seg_rew=seg_batch_rew, 
                     seg_done=seg_batch_done,
                     seg_len=seg_batch_len,
                     seg_mask=seg_batch_mask)
        return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in batch.items()}
    
    def sample_mixed_segment_batch(self, batch_size=32, max_segment_len=10):
        # Sample segment with max_segment_len
        seg_batch = self.sample_segment_batch(batch_size, max_segment_len)
        
        

### Test LSTMReplayBuffer

In [38]:
import gym

In [39]:
env = gym.make('HalfCheetah-v2')

In [40]:
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
act_limit = env.action_space.high[0]

In [41]:
lstm_bf = LSTMReplayBuffer(obs_dim, act_dim, int(1e6))

In [42]:
lstm_bf.size

0

In [43]:
#
total_steps = 10000
o, ep_len = env.reset(), 0

# Main loop: collect experience in env and update/log each epoch
for t in range(total_steps):
    a = env.action_space.sample()
    # Step the env
    o2, r, d, _ = env.step(a)
    ep_len += 1
    
    # Store experience to replay buffer
    lstm_bf.store(o, a, r, o2, d)
    
    o = o2
    
    # End of trajectory handling
    if d or (ep_len == 1000):
        o, ep_len = env.reset(), 0

In [44]:
seg_batch = lstm_bf.sample_segment_batch(segment_len=10)

In [45]:
seg_batch.keys()

dict_keys(['seg_obs', 'seg_act', 'seg_obs2', 'seg_rew', 'seg_done', 'seg_len', 'seg_mask'])

In [46]:
seg_batch['seg_len']

tensor([10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
        10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
        10., 10., 10., 10.])

In [47]:
seg_batch['seg_done']

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],


In [49]:
seg_batch['seg_mask'].shape

torch.Size([32, 10])

In [13]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [14]:
class MLPCritic(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_sizes=(128, 128)):
        super(MLPCritic, self).__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        
        self.layers = nn.ModuleList()
        layer_size = [obs_dim+act_dim]+list(hidden_sizes) + [1]
        for h in range(len(layer_size)-2):
            self.layers += [nn.Linear(layer_size[h], layer_size[h+1]), nn.ReLU()]
        self.layers += [nn.Linear(layer_size[-2], layer_size[-1]), nn.Identity()]
    
    def forward(self, obs, act):
        cat_input = torch.cat([obs, act], dim=-1)
        x = cat_input
        for layer in self.layers:
            x = layer(x)
        return torch.squeeze(x, -1) # Critical to ensure q has right shape.

class MLPActor(nn.Module):
    def __init__(self, obs_dim, act_dim, act_limit, hidden_sizes=(128, 128)):
        super(MLPActor, self).__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.act_limit = act_limit
        
        self.layers = nn.ModuleList()
        layer_size = [obs_dim]+list(hidden_sizes) + [act_dim]
        for h in range(len(layer_size)-2):
            self.layers += [nn.Linear(layer_size[h], layer_size[h+1]), nn.ReLU()]
        self.layers += [nn.Linear(layer_size[-2], layer_size[-1]), nn.Tanh()]
    
    def forward(self, obs):
        x = obs
        for layer in self.layers:
            x = layer(x)
        return self.act_limit * x

class MLPActorCritic(nn.Module):
    def __init__(self, obs_dim, act_dim, act_limit, hidden_sizes=(128, 128)):
        super(MLPActorCritic, self).__init__()
        self.q1 = MLPCritic(obs_dim, act_dim)
        self.q2 = MLPCritic(obs_dim, act_dim)
        self.pi = MLPActor(obs_dim, act_dim, act_limit=1)
    
    def act(self, obs):
        with torch.no_grad():
            return self.pi(obs).numpy() 
        

class LSTMCritic(nn.Module):
    def __init__(self, obs_dim, act_dim, lstm_hidden_dim=128, lstm_hidden_num_layers=1, 
                 fc_hidden_sizes=(128,)):
        super(LSTMCritic, self).__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        
        self.layers = nn.ModuleList()
        # LSTM layers
        self.lstm_hidden_dim = lstm_hidden_dim
        self.lstm_hidden_num_layers = lstm_hidden_num_layers
        self.layers += [nn.LSTM(obs_dim+act_dim, self.lstm_hidden_dim, self.lstm_hidden_num_layers, batch_first=True)]
        # Fully connected layers
        self.fc_hidden_sizes = [self.lstm_hidden_dim] + list(fc_hidden_sizes)
        for j in range(len(self.fc_hidden_sizes) - 1):
            self.layers += [nn.Linear(self.fc_hidden_sizes[j], self.fc_hidden_sizes[j + 1]), nn.ReLU()]
        # Output layer
        self.layers += [nn.Linear(self.fc_hidden_sizes[-1], 1), nn.Identity()]

    def forward(self, obs, act, seg_len=None, gather_last=True):
        # LSTM layers
        cat_input = torch.cat([obs, act], dim=-1)
        if seg_len is not None:
            input_packed = pack_padded_sequence(cat_input, lengths=seg_len,
                                                batch_first=True, enforce_sorted=False)
        else:
            input_packed = pack_padded_sequence(cat_input, lengths=[cat_input.size(1) for _ in range(cat_input.size(0))],
                                                batch_first=True, enforce_sorted=False)
        
        lstm_output_packed, (lstm_hidden_state, lstm_cell_state) = self.layers[0](input_packed)
        lstm_output_padded, lstm_output_lengths = pad_packed_sequence(lstm_output_packed, batch_first=True)
        x = lstm_output_padded
        for layer in self.layers[1:]:
            x = layer(x)
        # Only take Q(s_t, a_t)
        output = x
        if gather_last:
            return output.view(obs.shape[0], obs.shape[1]).gather(1, (seg_len-1).view(-1, 1).long())
        else:
            return output
#         return output

class LSTMActor(nn.Module):
    def __init__(self, obs_dim, act_dim, act_limit, lstm_hidden_dim=128, lstm_hidden_num_layers=1,
                 fc_hidden_sizes=(128,)):
        super(LSTMActor, self).__init__()
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.act_limit = act_limit
        
        self.layers = nn.ModuleList()
        # LSTM layers
        self.lstm_hidden_dim = lstm_hidden_dim
        self.lstm_hidden_num_layers = lstm_hidden_num_layers
        self.layers += [nn.LSTM(obs_dim, self.lstm_hidden_dim, self.lstm_hidden_num_layers, batch_first=True)]
        
        # Fully connected layers
        self.fc_hidden_sizes = [self.lstm_hidden_dim] + list(fc_hidden_sizes)
        for j in range(len(self.fc_hidden_sizes) - 1):
            self.layers += [nn.Linear(self.fc_hidden_sizes[j], self.fc_hidden_sizes[j + 1]), nn.ReLU()]
        # Output layer
        self.layers += [nn.Linear(self.fc_hidden_sizes[-1], self.act_dim), nn.Tanh()]
        
    def forward(self, obs, seg_len=None, gather_last=True):
        # LSTM layers
        if seg_len is not None:
            obs_packed = pack_padded_sequence(obs, lengths=seg_len, batch_first=True, enforce_sorted=False)
        else:
            obs_packed = pack_padded_sequence(obs, lengths=[obs.size(1) for _ in range(obs.size(0))], batch_first=True,
                                              enforce_sorted=False)
        lstm_output_packed, (lstm_hidden_state, lstm_cell_state) = self.layers[0](obs_packed)
        lstm_output_padded, lstm_output_lengths = pad_packed_sequence(lstm_output_packed, batch_first=True)
        x = lstm_output_padded
        for layer in self.layers[1:]:
            x = layer(x)
        # Return output from network scaled to action space limits.
        return self.act_limit * x

In [15]:
mlp_q = MLPCritic(obs_dim, act_dim)
mlp_a = MLPActor(obs_dim, act_dim, act_limit)
mlp_q.cuda()
mlp_a.cuda()
print(mlp_q)
print(mlp_a)

MLPCritic(
  (layers): ModuleList(
    (0): Linear(in_features=23, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=1, bias=True)
    (5): Identity()
  )
)
MLPActor(
  (layers): ModuleList(
    (0): Linear(in_features=17, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=6, bias=True)
    (5): Tanh()
  )
)


In [16]:
lstm_q = LSTMCritic(obs_dim, act_dim)
lstm_a = LSTMActor(obs_dim, act_dim, act_limit)
lstm_q.cuda()
lstm_a.cuda()
print(lstm_q)
print(lstm_a)

LSTMCritic(
  (layers): ModuleList(
    (0): LSTM(23, 128, batch_first=True)
    (1): Linear(in_features=128, out_features=128, bias=True)
    (2): ReLU()
    (3): Linear(in_features=128, out_features=1, bias=True)
    (4): Identity()
  )
)
LSTMActor(
  (layers): ModuleList(
    (0): LSTM(17, 128, batch_first=True)
    (1): Linear(in_features=128, out_features=128, bias=True)
    (2): ReLU()
    (3): Linear(in_features=128, out_features=6, bias=True)
    (4): Tanh()
  )
)


In [17]:
len(list(lstm_q.parameters()))

8

In [18]:
cuda = torch.device('cuda')

In [19]:
seg_obs = torch.rand([1,10, obs_dim]).to(device=cuda)
seg_len = torch.tensor([10.0]).to(device=cuda)
with torch.no_grad():
    a = lstm_a(seg_obs, seg_len)
a = torch.gather(a, 1, (seg_len-1).view(-1,1).repeat(1, act_dim).unsqueeze(1).type(torch.LongTensor).cuda()).squeeze()
a = a.cpu().numpy()
a += 0.1 * np.random.randn(act_dim)
a

array([-0.01414432,  0.0830159 , -0.08723574, -0.0658123 ,  0.01790731,
        0.06924526], dtype=float32)

#### Q-loss

In [20]:
(q2-1).view(-1,1).type(torch.LongTensor)

NameError: name 'q2' is not defined

In [None]:
q1 = lstm_q(seg_batch['seg_obs'].to(device=cuda), seg_batch['seg_act'].to(device=cuda), 
           seg_batch['seg_len'].to(device=cuda))
q1
q2 = lstm_q(seg_batch['seg_obs'].to(device=cuda), seg_batch['seg_act'].to(device=cuda), 
           seg_batch['seg_len'].to(device=cuda))
q2

In [21]:
q1 = lstm_q(seg_batch['seg_obs'].to(device=cuda), seg_batch['seg_act'].to(device=cuda), 
           seg_batch['seg_len'].to(device=cuda), gather_last=False)
q1.squeeze(2).shape

torch.Size([32, 10])

In [25]:
q1.shape

torch.Size([32, 10, 1])

In [23]:
q

NameError: name 'q' is not defined

In [182]:
l = seg_batch['seg_len'].to(device=cuda)

q = q1.squeeze(2)
q.shape
q[np.arange(32), (l-1).type(torch.LongTensor), :]

IndexError: too many indices for tensor of dimension 2

In [93]:
torch.min(q1, q2)

tensor([[ 5.5766e-02],
        [ 2.3664e-02],
        [ 3.3209e-02],
        [ 6.0861e-02],
        [ 1.7107e-03],
        [ 6.4048e-03],
        [ 2.8078e-02],
        [ 3.5067e-02],
        [ 4.2111e-02],
        [ 1.1461e-02],
        [ 4.0608e-02],
        [ 8.8063e-03],
        [ 6.0561e-02],
        [-1.5304e-03],
        [-6.4896e-03],
        [ 8.3145e-03],
        [ 3.5173e-02],
        [ 5.6848e-02],
        [ 2.2580e-05],
        [ 4.0550e-02],
        [-1.2118e-02],
        [ 1.4683e-02],
        [ 1.9199e-02],
        [ 1.0760e-02],
        [-2.1493e-02],
        [-9.3004e-04],
        [-1.2201e-02],
        [ 1.6077e-02],
        [ 2.6102e-02],
        [ 6.4398e-03],
        [ 4.6120e-02],
        [ 4.9644e-03]], device='cuda:0', grad_fn=<MinBackward2>)

In [187]:
q = lstm_q(seg_batch['seg_obs'].to(device=cuda), seg_batch['seg_act'].to(device=cuda), 
           seg_batch['seg_len'].to(device=cuda))

# Bellman backup for Q function
with torch.no_grad():
    gamma = 0.99
    batch_size = seg_batch['seg_act'].shape[0]
    seg_max_len = seg_batch['seg_act'].shape[1]

    seg_act2 = torch.zeros(seg_batch['seg_act'].shape).to(device=cuda)
    # Move forward
    seg_act2[:, :seg_max_len-1, :] = seg_batch['seg_act'][:, 1:, :]
    # Add predicted action
    pi_targ = lstm_a(seg_batch['seg_obs2'].to(device=cuda), seg_batch['seg_len'].to(device=cuda))
    seg_act2[np.arange(batch_size), (seg_batch['seg_len']-1).type(torch.LongTensor).to(device=cuda), :] = torch.gather(pi_targ, 1, (seg_batch['seg_len']-1).view(-1,1).repeat(1, act_dim).unsqueeze(1).type(torch.LongTensor).to(device=cuda)).squeeze()
    q_pi_targ = lstm_q(seg_batch['seg_obs2'].to(device=cuda), seg_act2.to(device=cuda), 
                       seg_batch['seg_len'].to(device=cuda))
    
    gathered_rew = torch.gather(seg_batch['seg_rew'], 1, (seg_batch['seg_len']-1).view(-1,1).type(torch.LongTensor)).to(device=cuda)
    gathered_done = torch.gather(seg_batch['seg_done'], 1, (seg_batch['seg_len']-1).view(-1,1).type(torch.LongTensor)).to(device=cuda)
    
    backup = gathered_rew + gamma * (1-gathered_done) * q_pi_targ
    
loss_q = ((q-backup)**2).mean()

#### Pi-loss

In [214]:
pi = lstm_a(seg_batch['seg_obs'].to(device=cuda), seg_batch['seg_len'].to(device=cuda))
q_pi = lstm_q(seg_batch['seg_obs'].to(device=cuda), pi, seg_batch['seg_len'].to(device=cuda))
loss_pi = -q_pi.mean()

In [215]:
loss_pi

tensor(0.0438, device='cuda:0', grad_fn=<NegBackward>)

In [212]:
# Combine past action and the predicted action
pi = lstm_a(seg_batch['seg_obs'].to(device=cuda), seg_batch['seg_len'].to(device=cuda))

batch_size = seg_batch['seg_act'].shape[0]
seg_max_len = seg_batch['seg_act'].shape[1]
seg_act = torch.zeros(seg_batch['seg_act'].shape).to(device=cuda)
# 
seg_act = seg_batch['seg_act'].to(device=cuda)
seg_act[np.arange(batch_size), (seg_batch['seg_len']-1).type(torch.LongTensor).to(device=cuda), :] = torch.gather(pi, 1, (seg_batch['seg_len']-1).view(-1,1).repeat(1, act_dim).unsqueeze(1).type(torch.LongTensor).to(device=cuda)).squeeze()

q_pi = lstm_q(seg_batch['seg_obs'].to(device=cuda), seg_act, seg_batch['seg_len'].to(device=cuda))
loss_pi = -q_pi.mean()

In [213]:
loss_pi

tensor(0.0439, device='cuda:0', grad_fn=<NegBackward>)

## Test

In [None]:
lstm_bf = LSTMReplayBuffer(obs_dim, act_dim, int(1e6))

In [357]:
deepcopy(lstm_a)

LSTMActor(
  (lstm): LSTM(111, 128, num_layers=2, batch_first=True)
)

In [358]:
logger = EpochLogger(**logger_kwargs)
logger.save_config(locals())

torch.manual_seed(seed)
np.random.seed(seed)

env, test_env = env_fn(), env_fn()
obs_dim = env.observation_space.shape
act_dim = env.action_space.shape[0]

# Action limit for clamping: critically, assumes all dimensions share the same bound!
act_limit = env.action_space.high[0]

# Create actor-critic module and target networks
lstm_q = LSTMCritic(obs_dim, act_dim)
lstm_a = LSTMActor(obs_dim, act_dim, act_limit)
lstm_q_targ = deepcopy(lstm_q)
lstm_a_targ = deepcopy(lstm_a)


# Freeze target networks with respect to optimizers (only update via polyak averaging)
for p in lstm_q_targ.parameters():
    p.requires_grad = False
for p in lstm_a_targ.parameters():
    p.requires_grad = False

# Experience buffer
lstm_bf = LSTMReplayBuffer(obs_dim, act_dim, int(1e6))

# # Count variables (protip: try to get a feel for how different size networks behave!)
# var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q])
# logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n'%var_counts)

NameError: name 'logger_kwargs' is not defined

In [190]:
# Set up function for computing DDPG Q-loss
def compute_loss_q(seg_batch):
    q = lstm_q(seg_batch['seg_obs'], seg_batch['seg_act'], seg_batch['seg_len'])

    # Bellman backup for Q function
    with torch.no_grad():
        gamma = 0.99
        batch_size = seg_batch['seg_act'].shape[0]
        seg_max_len = seg_batch['seg_act'].shape[1]

        seg_act2 = torch.zeros(seg_batch['seg_act'].shape)
        # Move forward
        seg_act2[:, :seg_max_len-1, :] = seg_batch['seg_act'][:, 1:, :]
        # Add predicted action
        pi_targ = lstm_a(seg_batch['seg_obs2'], seg_batch['seg_len'])
        seg_act2[np.arange(batch_size), (seg_batch['seg_len']-1).type(torch.LongTensor), :] = torch.gather(pi_targ, 1, (seg_batch['seg_len']-1).view(-1,1).repeat(1, act_dim).unsqueeze(1).type(torch.LongTensor)).squeeze()
        q_pi_targ = lstm_q(seg_batch['seg_obs2'], seg_act2, seg_batch['seg_len'])

        gathered_rew = torch.gather(seg_batch['seg_rew'], 1, (seg_batch['seg_len']-1).view(-1,1).type(torch.LongTensor))
        gathered_done = torch.gather(seg_batch['seg_done'], 1, (seg_batch['seg_len']-1).view(-1,1).type(torch.LongTensor))

        backup = gathered_rew + gamma * (1-gathered_done) * q_pi_targ
    
    # MSE loss against Bellman backup
    loss_q = ((q-backup)**2).mean()
    # Useful info for logging
    loss_info = dict(QVals=q.squeeze().detach().numpy())
    
    return loss_q, loss_info

# Set up function for computing DDPG pi loss
def compute_loss_pi(seg_batch):
    pi = lstm_a(seg_batch['seg_obs'].to(device=cuda), seg_batch['seg_len'].to(device=cuda))
    q_pi = lstm_q(seg_batch['seg_obs'].to(device=cuda), pi, seg_batch['seg_len'].to(device=cuda))
    return -q_pi.mean()

# Set up optimizers for policy and q-function
pi_optimizer = Adam(lstm_a.parameters(), lr=pi_lr)
q_optimizer = Adam(lstm_q.parameters(), lr=q_lr)

# # Set up model saving
# logger.setup_pytorch_saver(ac)

def update(data):
    # First run one gradient descent step for Q.
    q_optimizer.zero_grad()
    loss_q, loss_info = compute_loss_q(data)
    loss_q.backward()
    q_optimizer.step()

    # Freeze Q-network so you don't waste computational effort 
    # computing gradients for it during the policy learning step.
    for p in lstm_q.parameters():
        p.requires_grad = False

    # Next run one gradient descent step for pi.
    pi_optimizer.zero_grad()
    loss_pi = compute_loss_pi(data)
    loss_pi.backward()
    pi_optimizer.step()

    # Unfreeze Q-network so you can optimize it at next DDPG step.
    for p in ac.q.parameters():
        p.requires_grad = True

    # Record things
    logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info)

    # Finally, update target networks by polyak averaging.
    with torch.no_grad():
        for p, p_targ in zip(lstm_a.parameters(), lstm_a_targ.parameters()):
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)
        for p, p_targ in zip(lstm_q.parameters(), lstm_q_targ.parameters()):
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)


def get_action(seg_o, seg_l, noise_scale):
    seg_obs = torch.tensor(seg_o).view(1, seg_o.shape[0], seg_o.shape[1]).to(device=cuda)
    seg_len = torch.tensor([seg_l]).to(device=cuda)
    with torch.no_grad():
        a = lstm_a(seg_obs, seg_len)
    a = torch.gather(a, 1, (seg_len-1).view(-1,1).repeat(1, act_dim).unsqueeze(1).type(torch.LongTensor).to(device=cuda)).squeeze()
    a = a.cpu().numpy()
    a += 0.1 * np.random.randn(act_dim)
    return np.clip(a, -act_limit, act_limit)

def test_agent():
    for j in range(num_test_episodes):
        o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
        while not(d or (ep_len == max_ep_len)):
            # Take deterministic actions at test time (noise_scale=0)
            o, r, d, _ = test_env.step(get_action(o, 0))
            ep_ret += r
            ep_len += 1
        logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

# Prepare for interaction with environment
total_steps = steps_per_epoch * epochs
start_time = time.time()
o, ep_ret, ep_len = env.reset(), 0, 0

# Main loop: collect experience in env and update/log each epoch
for t in range(total_steps):

    # Until start_steps have elapsed, randomly sample actions
    # from a uniform distribution for better exploration. Afterwards, 
    # use the learned policy (with some noise, via act_noise). 
    if t > start_steps:
        a = get_action(o, act_noise)
    else:
        a = env.action_space.sample()

    # Step the env
    o2, r, d, _ = env.step(a)
    ep_ret += r
    ep_len += 1

    # Ignore the "done" signal if it comes from hitting the time
    # horizon (that is, when it's an artificial terminal signal
    # that isn't based on the agent's state)
    d = False if ep_len==max_ep_len else d

    # Store experience to replay buffer
    lstm_bf.store(o, a, r, o2, d)

    # Super critical, easy to overlook step: make sure to update 
    # most recent observation!
    o = o2

    # End of trajectory handling
    if d or (ep_len == max_ep_len):
        logger.store(EpRet=ep_ret, EpLen=ep_len)
        o, ep_ret, ep_len = env.reset(), 0, 0

    # Update handling
    if t >= update_after and t % update_every == 0:
        for _ in range(update_every):
            segment_batch = lstm_bf.sample_segment_batch(batch_size)
            update(data=segment_batch)

    # End of epoch handling
    if (t+1) % steps_per_epoch == 0:
        epoch = (t+1) // steps_per_epoch

#         # Save model
#         if (epoch % save_freq == 0) or (epoch == epochs):
#             logger.save_state({'env': env}, None)

        # Test the performance of the deterministic version of the agent.
        test_agent()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('TestEpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('TestEpLen', average_only=True)
        logger.log_tabular('TotalEnvInteracts', t)
        logger.log_tabular('QVals', with_min_and_max=True)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossQ', average_only=True)
        logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()


NameError: name 'pi_lr' is not defined

### LSTM-DDPG

In [416]:

def ddpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, 
         steps_per_epoch=2000, epochs=100, replay_size=int(1e6), gamma=0.99, 
         polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=1000, 
         update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, 
         max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """
    Deep Deterministic Policy Gradient (DDPG)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, and a ``q`` module. The ``act`` method and
            ``pi`` module should accept batches of observations as inputs,
            and ``q`` should accept a batch of observations and a batch of 
            actions as inputs. When called, these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q``        (batch,)          | Tensor containing the current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    segment_len=10
    
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    lstm_q = LSTMCritic(obs_dim, act_dim)
    lstm_a = LSTMActor(obs_dim, act_dim, act_limit)
    lstm_q_targ = deepcopy(lstm_q)
    lstm_a_targ = deepcopy(lstm_a)
#     lstm_q.cuda()
#     lstm_a.cuda()
#     lstm_q_targ.cuda()
#     lstm_a_targ.cuda()


    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in lstm_q_targ.parameters():
        p.requires_grad = False
    for p in lstm_a_targ.parameters():
        p.requires_grad = False

    # Experience buffer
    lstm_bf = LSTMReplayBuffer(obs_dim, act_dim, int(1e6))
    
    # Set up function for computing DDPG Q-loss
    def compute_loss_q(seg_batch):
        q = lstm_q(seg_batch['seg_obs'].to(device=cuda), seg_batch['seg_act'].to(device=cuda), 
           seg_batch['seg_len'].to(device=cuda))

        # Bellman backup for Q function
        with torch.no_grad():
            batch_size = seg_batch['seg_act'].shape[0]
            seg_max_len = seg_batch['seg_act'].shape[1]

            seg_act2 = torch.zeros(seg_batch['seg_act'].shape).to(device=cuda)
            # Move forward
            seg_act2[:, :seg_max_len-1, :] = seg_batch['seg_act'][:, 1:, :]
            # Add predicted action
            pi_targ = lstm_a(seg_batch['seg_obs2'].to(device=cuda), seg_batch['seg_len'].to(device=cuda))
            seg_act2[np.arange(batch_size), (seg_batch['seg_len']-1).type(torch.LongTensor).to(device=cuda), :] = torch.gather(pi_targ, 1, (seg_batch['seg_len']-1).view(-1,1).repeat(1, act_dim).unsqueeze(1).type(torch.LongTensor).to(device=cuda)).squeeze()
            q_pi_targ = lstm_q(seg_batch['seg_obs2'].to(device=cuda), seg_act2.to(device=cuda), 
                               seg_batch['seg_len'].to(device=cuda))

            gathered_rew = torch.gather(seg_batch['seg_rew'], 1, (seg_batch['seg_len']-1).view(-1,1).type(torch.LongTensor)).to(device=cuda)
            gathered_done = torch.gather(seg_batch['seg_done'], 1, (seg_batch['seg_len']-1).view(-1,1).type(torch.LongTensor)).to(device=cuda)

            backup = gathered_rew + gamma * (1-gathered_done) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q = ((q-backup)**2).mean()
        # Useful info for logging
        loss_info = dict(QVals=q.squeeze().detach().numpy())
        
        return loss_q, loss_info

    # Set up function for computing DDPG pi loss
    def compute_loss_pi(seg_batch):
        pi = lstm_a(seg_batch['seg_obs'], seg_batch['seg_len'])
        q_pi = lstm_q(seg_batch['seg_obs'], pi, seg_batch['seg_len'])
        return -q_pi.mean()

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(lstm_a.parameters(), lr=pi_lr)
    q_optimizer = Adam(lstm_q.parameters(), lr=q_lr)

    # # Set up model saving
    # logger.setup_pytorch_saver(ac)

    def update(data):
        # First run one gradient descent step for Q.
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Freeze Q-network so you don't waste computational effort 
        # computing gradients for it during the policy learning step.
        for p in lstm_q.parameters():
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-network so you can optimize it at next DDPG step.
        for p in lstm_q.parameters():
            p.requires_grad = True

        # Record things
        logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(lstm_a.parameters(), lstm_a_targ.parameters()):
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)
            for p, p_targ in zip(lstm_q.parameters(), lstm_q_targ.parameters()):
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)


    def get_action(seg_o, seg_l, noise_scale):
        seg_obs = torch.tensor(seg_o).view(1, seg_o.shape[0], seg_o.shape[1]).float()
        seg_len = torch.tensor([seg_l]).float()
        with torch.no_grad():
            a = lstm_a(seg_obs, seg_len)
        a = torch.gather(a, 1, (seg_len-1).view(-1,1).repeat(1, act_dim).unsqueeze(1).type(torch.LongTensor)).squeeze()
        a = a.numpy()
        a += 0.1 * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            o, ep_ret, ep_len = env.reset(), 0, 0
            o_buff = np.zeros([segment_len, obs_dim])
            o_buff[0,:] = o
            o_buff_len = 1
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o_buff, o_buff_len, 0))
                ep_ret += r
                ep_len += 1
                
                if o_buff_len == segment_len:
                    o_buff[:segment_len-1] = o_buff[1:]
                    o_buff[segment_len-1] = o
                else:
                    o_buff[o_buff_len+1-1] = o
                    o_buff_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0
    o_buff = np.zeros([segment_len, obs_dim])
    o_buff[0,:] = o
    o_buff_len = 1
    
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        if t%200 == 0:
            print("t={}".format(t))
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards, 
        # use the learned policy (with some noise, via act_noise). 
        if t > start_steps:
            a = get_action(o_buff, o_buff_len, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        lstm_bf.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2
        
        
        if o_buff_len == segment_len:
            o_buff[:segment_len-1] = o_buff[1:]
            o_buff[segment_len-1] = o
        else:
            o_buff[o_buff_len+1-1] = o
            o_buff_len += 1
        

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0
            o_buff[0,:] = o
            o_buff_len = 1
    
        # Update handling
        if t >= update_after and t % update_every == 0:
            for _ in range(update_every):
#                 import pdb
#                 pdb.set_trace()
                segment_batch = lstm_bf.sample_segment_batch(batch_size, segment_len)
                update(data=segment_batch)

        # End of epoch handling
        if (t+1) % steps_per_epoch == 0:
            epoch = (t+1) // steps_per_epoch

    #         # Save model
    #         if (epoch % save_freq == 0) or (epoch == epochs):
    #             logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()




In [417]:
env = gym.make(args['env'])
env.observation_space

Box(17,)

In [None]:


args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 'gamma': 0.99, 'seed': 0, 'epochs': 50, 'exp_name': 'lstm_ddpg'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

ddpg(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_ddpg\lstm_ddpg_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x000001F421A91DC8>",
    "epochs":	50,
    "exp_name":	"lstm_ddpg",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000001F47F076048>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_ddpg",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng\\google drive\\git_repos\\spinningup-new\\data\\lstm_ddpg\\lstm_ddpg_s0",
            "output_file":	{
                "<_io.TextIOWrapper name='c:\\\\users\\\\lingheng\\\\google drive\\\\git_repos\\\\spinningup-new\\\\data\\\\lstm_ddpg\\\\lst

t=15200
t=15400
t=15600
t=15800
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |            -136 |
|          StdEpRet |            1.36 |
|          MaxEpRet |            -135 |
|          MinEpRet |            -138 |
|  AverageTestEpRet |            -126 |
|      StdTestEpRet |            2.56 |
|      MaxTestEpRet |            -122 |
|      MinTestEpRet |            -131 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|      AverageQVals |          -0.461 |
|          StdQVals |            1.13 |
|          MaxQVals |           0.705 |
|          MinQVals |           -1.76 |
|            LossPi |          -0.656 |
|             LossQ |          0.0332 |
|              Time |        6.19e+03 |
---------------------------------------
t=16000
t=16200
t=16400
t=16600
t=16800
t=17000
t=17200
t=17400
t=17600
t=17800
---------------------------------------
|       

t=32000
t=32200
t=32400
t=32600
t=32800
t=33000
t=33200
t=33400
t=33600
t=33800
---------------------------------------
|             Epoch |              17 |
|      AverageEpRet |           -95.4 |
|          StdEpRet |            3.21 |
|          MaxEpRet |           -92.1 |
|          MinEpRet |           -98.6 |
|  AverageTestEpRet |            -106 |
|      StdTestEpRet |            1.85 |
|      MaxTestEpRet |            -102 |
|      MinTestEpRet |            -108 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         3.4e+04 |
|      AverageQVals |           -1.14 |
|          StdQVals |            1.03 |
|          MaxQVals |           0.731 |
|          MinQVals |           -1.76 |
|            LossPi |          -0.703 |
|             LossQ |          0.0286 |
|              Time |        1.34e+04 |
---------------------------------------
t=34000
t=34200
t=34400
t=34600
t=34800
t=35000
t=35200
t=35400
t=35600
t=35800


In [None]:
from copy import deepcopy
import numpy as np
import torch
from torch.optim import Adam
import gym
import time
import spinup.algos.pytorch.lstm_ddpg.core as core
from spinup.utils.logx import EpochLogger

In [2]:
obs_dim = 5
act_dim = 10
act_limit = 1

In [3]:
lstm_actor = core.LSTMActor(obs_dim, act_dim, act_limit, lstm_hidden_dim=128, lstm_hidden_num_layers=2)

In [4]:
lstm_critic = core.LSTMCritic(obs_dim, act_dim, lstm_hidden_dim=128, lstm_hidden_num_layers=2)

In [5]:
batch_size = 64
max_seg_len = 10
obs = np.random.randn(batch_size, max_seg_len, obs_dim)
act = np.random.randn(batch_size, max_seg_len, act_dim)
seg_len = np.random.randint(1, 11, batch_size)

In [6]:
obs = torch.as_tensor(obs, dtype=torch.float32)
act = torch.as_tensor(act, dtype=torch.float32)

In [13]:
# out = lstm_actor(obs, seg_len)
out = lstm_actor(obs[:,0,:].view(-1, 1, obs_dim))

In [14]:
out.shape

torch.Size([64, 1, 10])

In [8]:
lstm_actor

LSTMActor(
  (lstm): LSTM(5, 128, num_layers=2, batch_first=True)
)

In [9]:
q_out = lstm_critic(obs, act, seg_len)

In [10]:
lstm_critic

LSTMQFunction(
  (lstm_layer): LSTM(15, 128, num_layers=2, batch_first=True)
)

In [11]:
q_out.shape

torch.Size([64, 10, 1])

In [17]:
seg_len-1

array([9, 1, 8, 7, 7, 7, 2, 7, 6, 6, 7, 3, 7, 5, 6, 3, 9, 4, 3, 5, 0, 2,
       0, 0, 1, 4, 9, 3, 7, 4, 6, 5, 4, 8, 2, 5, 1, 8, 8, 8, 2, 9, 0, 1,
       0, 4, 8, 2, 7, 9, 8, 8, 4, 2, 2, 2, 1, 3, 2, 1, 6, 3, 9, 0])

In [7]:
actor_critic = core.LSTMActorCritic(obs_dim, act_dim, act_limit)

In [9]:
actor_critic.act(obs).shape

(64, 10, 10)

In [6]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 'gamma': 0.99, 'seed': 0, 'epochs': 50, 'exp_name': 'lstm_ddpg'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])


env_fn = lambda : gym.make(args['env'])
actor_critic=core.LSTMActorCritic
ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l'])
seed=args['seed']
steps_per_epoch=4000
epochs=args['epochs']
replay_size=int(1e6)
gamma=args['gamma']
polyak=0.995
pi_lr=1e-3
q_lr=1e-3
batch_size=100
start_steps=10000
update_after=1000
update_every=50
act_noise=0.1
num_test_episodes=10
max_ep_len=1000
logger_kwargs=logger_kwargs
save_freq=1

In [5]:
logger = EpochLogger(**logger_kwargs)
# logger.save_config(globals())

torch.manual_seed(seed)
np.random.seed(seed)

env, test_env = env_fn(), env_fn()
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

# Action limit for clamping: critically, assumes all dimensions share the same bound!
act_limit = env.action_space.high[0]

# Create actor-critic module and target networks
ac = actor_critic(obs_dim, act_dim, act_limit)
ac_targ = deepcopy(ac)

# Freeze target networks with respect to optimizers (only update via polyak averaging)
for p in ac_targ.parameters():
    p.requires_grad = False

# Experience buffer
replay_buffer = LSTMReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, max_size=replay_size)

# Count variables (protip: try to get a feel for how different size networks behave!)
var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q])
print('\nNumber of parameters: \t pi: %d, \t q: %d\n'%var_counts)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_ddpg\lstm_ddpg_s0\progress.txt[0m

Number of parameters: 	 pi: 207360, 	 q: 210432



In [7]:
# Prepare for interaction with environment
total_steps = steps_per_epoch * epochs
start_time = time.time()
o, ep_ret, ep_len = env.reset(), 0, 0

# Main loop: collect experience in env and update/log each epoch
for t in range(start_steps):

    # Until start_steps have elapsed, randomly sample actions
    # from a uniform distribution for better exploration. Afterwards, 
    # use the learned policy (with some noise, via act_noise). 
    if t > start_steps:
        a = get_action(o, act_noise)
    else:
        a = env.action_space.sample()

    # Step the env
    o2, r, d, _ = env.step(a)
    ep_ret += r
    ep_len += 1

    # Ignore the "done" signal if it comes from hitting the time
    # horizon (that is, when it's an artificial terminal signal
    # that isn't based on the agent's state)
    d = False if ep_len==max_ep_len else d

    # Store experience to replay buffer
    replay_buffer.store(o, a, r, o2, d)

    # Super critical, easy to overlook step: make sure to update 
    # most recent observation!
    o = o2

    # End of trajectory handling
    if d or (ep_len == max_ep_len):
        logger.store(EpRet=ep_ret, EpLen=ep_len)
        o, ep_ret, ep_len = env.reset(), 0, 0

In [10]:
replay_buffer.sample_batch()

{'obs': tensor([[-3.3502e-02,  1.2444e+00, -1.5387e-01, -2.3853e-01,  3.1484e-02,
          -6.3777e-02,  2.6116e-01,  4.6654e-01,  7.5932e-01,  1.3283e-01,
          -1.2097e-01,  1.0458e+01,  2.7993e-01, -9.7420e-01, -2.7538e+00,
           2.3621e+00,  5.4977e+00],
         [-2.1792e-02,  1.1547e-01,  4.2179e-02, -4.8774e-01, -2.7805e-01,
           4.0558e-02,  1.4876e-01,  4.6678e-01, -3.6021e-01,  3.6723e-02,
          -2.6175e+00,  6.8446e+00, -2.5707e+00, -1.2023e+01, -4.0846e+00,
           6.7622e+00, -1.5436e+00],
         [-1.2233e-01,  1.6047e-02,  1.2211e-01,  7.0308e-02, -4.9576e-02,
           2.8593e-01, -1.1292e-01, -1.8526e-01, -1.5464e+00, -6.1043e-01,
           1.0202e+00, -8.4718e+00,  3.9568e+00,  4.9554e-01,  2.3046e+00,
          -1.2500e+01,  5.0741e+00],
         [-1.5468e-01,  1.0209e-01, -1.7416e-01,  6.1568e-02, -3.9428e-01,
          -3.8677e-01, -4.3272e-01,  1.2961e-01, -7.1051e-01,  1.7510e-01,
           2.8254e+00,  3.1098e+00, -1.0333e+01, -1.1056e

In [12]:
data = replay_buffer.sample_segment_batch()

In [13]:
data.keys()

dict_keys(['seg_obs', 'seg_act', 'seg_obs2', 'seg_rew', 'seg_done', 'seg_len'])

In [14]:
seg_o, seg_a, seg_r, seg_o2, seg_d, seg_len = data['seg_obs'], data['seg_act'], data['seg_rew'], data['seg_obs2'], data['seg_done'], data['seg_len']
q = ac.q(seg_o,seg_a, seg_len)

In [15]:
q.shape

torch.Size([32, 10, 1])

In [17]:
boots_q = ac_targ.q(seg_o2, ac_targ.pi(seg_o2, seg_len), seg_len)

In [28]:
backup.shape

torch.Size([32, 10, 1])

In [27]:
backup = seg_r.view(seg_r.size(0), seg_r.size(1), 1) + gamma * (1-seg_d.view(seg_d.size(0), seg_d.size(1), 1)) * boots_q

In [10]:
# Set up function for computing DDPG Q-loss
def compute_loss_q(data):
    o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

    q = ac.q(o,a)

    # Bellman backup for Q function
    with torch.no_grad():
        q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2))
        backup = r + gamma * (1 - d) * q_pi_targ

    # MSE loss against Bellman backup
    loss_q = ((q - backup)**2).mean()

    # Useful info for logging
    loss_info = dict(QVals=q.detach().numpy())

    return loss_q, loss_info

# Set up function for computing DDPG pi loss
def compute_loss_pi(data):
    o = data['obs']
    q_pi = ac.q(o, ac.pi(o))
    return -q_pi.mean()

# Set up optimizers for policy and q-function
pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
q_optimizer = Adam(ac.q.parameters(), lr=q_lr)

# Set up model saving
logger.setup_pytorch_saver(ac)

def update(data):
    # First run one gradient descent step for Q.
    q_optimizer.zero_grad()
    loss_q, loss_info = compute_loss_q(data)
    loss_q.backward()
    q_optimizer.step()

    # Freeze Q-network so you don't waste computational effort 
    # computing gradients for it during the policy learning step.
    for p in ac.q.parameters():
        p.requires_grad = False

    # Next run one gradient descent step for pi.
    pi_optimizer.zero_grad()
    loss_pi = compute_loss_pi(data)
    loss_pi.backward()
    pi_optimizer.step()

    # Unfreeze Q-network so you can optimize it at next DDPG step.
    for p in ac.q.parameters():
        p.requires_grad = True

    # Record things
    logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info)

    # Finally, update target networks by polyak averaging.
    with torch.no_grad():
        for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
            # NB: We use an in-place operations "mul_", "add_" to update target
            # params, as opposed to "mul" and "add", which would make new tensors.
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)

def get_action(o, noise_scale):
    a = ac.act(torch.as_tensor(o, dtype=torch.float32))
    a += noise_scale * np.random.randn(act_dim)
    return np.clip(a, -act_limit, act_limit)

def test_agent():
    for j in range(num_test_episodes):
        o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
        while not(d or (ep_len == max_ep_len)):
            # Take deterministic actions at test time (noise_scale=0)
            o, r, d, _ = test_env.step(get_action(o, 0))
            ep_ret += r
            ep_len += 1
        logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

# Prepare for interaction with environment
total_steps = steps_per_epoch * epochs
start_time = time.time()
o, ep_ret, ep_len = env.reset(), 0, 0

# Main loop: collect experience in env and update/log each epoch
for t in range(total_steps):

    # Until start_steps have elapsed, randomly sample actions
    # from a uniform distribution for better exploration. Afterwards, 
    # use the learned policy (with some noise, via act_noise). 
    if t > start_steps:
        a = get_action(o, act_noise)
    else:
        a = env.action_space.sample()

    # Step the env
    o2, r, d, _ = env.step(a)
    ep_ret += r
    ep_len += 1

    # Ignore the "done" signal if it comes from hitting the time
    # horizon (that is, when it's an artificial terminal signal
    # that isn't based on the agent's state)
    d = False if ep_len==max_ep_len else d

    # Store experience to replay buffer
    replay_buffer.store(o, a, r, o2, d)

    # Super critical, easy to overlook step: make sure to update 
    # most recent observation!
    o = o2

    # End of trajectory handling
    if d or (ep_len == max_ep_len):
        logger.store(EpRet=ep_ret, EpLen=ep_len)
        o, ep_ret, ep_len = env.reset(), 0, 0

    # Update handling
    if t >= update_after and t % update_every == 0:
        for _ in range(update_every):
            batch = replay_buffer.sample_batch(batch_size)
            update(data=batch)

    # End of epoch handling
    if (t+1) % steps_per_epoch == 0:
        epoch = (t+1) // steps_per_epoch

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs):
            logger.save_state({'env': env}, None)

        # Test the performance of the deterministic version of the agent.
        test_agent()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('TestEpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('TestEpLen', average_only=True)
        logger.log_tabular('TotalEnvInteracts', t)
        logger.log_tabular('QVals', with_min_and_max=True)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossQ', average_only=True)
        logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()


[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_ddpg\lstm_ddpg_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"LSTMActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000018876D7FCA8>",
    "epochs":	50,
    "exp_name":	"lstm_ddpg",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x00000188766C8A08>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_ddpg",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng\\google drive\\git_repos\\spinningup-new\\data\\lstm_ddpg\\lstm_ddpg_s0",
            "output_file":	{
                "<_io.TextIOWrapper name='c:\\\\users\\\\lingheng\\\\google drive\\\\git_repos\\\\spinningup-new\\\\data\\\\lstm_ddpg\\\\ls

RuntimeError: input must have 2 dimensions, got 1

In [9]:
env = gym.make(args['env'])
env.observation_space.shape[0]

17

# Test GPU

In [220]:

def ddpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, 
         steps_per_epoch=2000, epochs=100, replay_size=int(1e6), gamma=0.99, 
         polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=1000, 
         update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=5, 
         max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """
    Deep Deterministic Policy Gradient (DDPG)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, and a ``q`` module. The ``act`` method and
            ``pi`` module should accept batches of observations as inputs,
            and ``q`` should accept a batch of observations and a batch of 
            actions as inputs. When called, these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q``        (batch,)          | Tensor containing the current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    segment_len=10
    
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    lstm_q = LSTMCritic(obs_dim, act_dim)
    lstm_a = LSTMActor(obs_dim, act_dim, act_limit)
    lstm_q_targ = deepcopy(lstm_q)
    lstm_a_targ = deepcopy(lstm_a)
    lstm_q.cuda()
    lstm_a.cuda()
    lstm_q_targ.cuda()
    lstm_a_targ.cuda()


    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in lstm_q_targ.parameters():
        p.requires_grad = False
    for p in lstm_a_targ.parameters():
        p.requires_grad = False

    # Experience buffer
    lstm_bf = LSTMReplayBuffer(obs_dim, act_dim, int(1e6))
    
    # Set up function for computing DDPG Q-loss
    def compute_loss_q(seg_batch):
        q = lstm_q(seg_batch['seg_obs'].to(device=cuda), seg_batch['seg_act'].to(device=cuda), 
                   seg_batch['seg_len'].to(device=cuda))

        # Bellman backup for Q function
        with torch.no_grad():
            batch_size = seg_batch['seg_act'].shape[0]
            seg_max_len = seg_batch['seg_act'].shape[1]

            seg_act2 = torch.zeros(seg_batch['seg_act'].shape).to(device=cuda)
            # Move forward
            seg_act2[:, :seg_max_len-1, :] = seg_batch['seg_act'][:, 1:, :]
            # Add predicted action
            pi_targ = lstm_a_targ(seg_batch['seg_obs2'].to(device=cuda), seg_batch['seg_len'].to(device=cuda))
            seg_act2[np.arange(batch_size), (seg_batch['seg_len']-1).type(torch.LongTensor).to(device=cuda), :] = torch.gather(pi_targ, 1, (seg_batch['seg_len']-1).view(-1,1).repeat(1, act_dim).unsqueeze(1).type(torch.LongTensor).to(device=cuda)).squeeze()
            q_pi_targ = lstm_q_targ(seg_batch['seg_obs2'].to(device=cuda), 
                                    seg_act2.to(device=cuda), 
                                    seg_batch['seg_len'].to(device=cuda))

            gathered_rew = torch.gather(seg_batch['seg_rew'], 1, (seg_batch['seg_len']-1).view(-1,1).type(torch.LongTensor).to(device=cuda))
            gathered_done = torch.gather(seg_batch['seg_done'], 1, (seg_batch['seg_len']-1).view(-1,1).type(torch.LongTensor).to(device=cuda))

            backup = gathered_rew + gamma * (1-gathered_done) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q = ((q-backup)**2).mean()
        # Useful info for logging
        loss_info = dict(QVals=q.squeeze().detach().cpu().numpy())
        
        return loss_q, loss_info

    # Set up function for computing DDPG pi loss
    def compute_loss_pi(seg_batch):
        # 1. 
        pi = lstm_a(seg_batch['seg_obs'].cuda(), seg_batch['seg_len'].cuda())
        q_pi = lstm_q(seg_batch['seg_obs'].cuda(), pi, seg_batch['seg_len'].cuda())
#         # 2.
#         # Combine past action and the predicted action
#         pi = lstm_a(seg_batch['seg_obs'].to(device=cuda), seg_batch['seg_len'].to(device=cuda))

#         batch_size = seg_batch['seg_act'].shape[0]
#         seg_max_len = seg_batch['seg_act'].shape[1]
#         seg_act = torch.zeros(seg_batch['seg_act'].shape).to(device=cuda)
#         # 
#         seg_act = seg_batch['seg_act'].to(device=cuda)
#         seg_act[np.arange(batch_size), (seg_batch['seg_len']-1).type(torch.LongTensor).to(device=cuda), :] = torch.gather(pi, 1, (seg_batch['seg_len']-1).view(-1,1).repeat(1, act_dim).unsqueeze(1).type(torch.LongTensor).to(device=cuda)).squeeze()

#         q_pi = lstm_q(seg_batch['seg_obs'].to(device=cuda), seg_act, seg_batch['seg_len'].to(device=cuda))
        return -q_pi.mean()

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(lstm_a.parameters(), lr=pi_lr)
    q_optimizer = Adam(lstm_q.parameters(), lr=q_lr)

    # # Set up model saving
    # logger.setup_pytorch_saver(ac)

    def update(data):
        # First run one gradient descent step for Q.
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Freeze Q-network so you don't waste computational effort 
        # computing gradients for it during the policy learning step.
        for p in lstm_q.parameters():
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-network so you can optimize it at next DDPG step.
        for p in lstm_q.parameters():
            p.requires_grad = True

        # Record things
        logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(lstm_a.parameters(), lstm_a_targ.parameters()):
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)
            for p, p_targ in zip(lstm_q.parameters(), lstm_q_targ.parameters()):
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)


    def get_action(seg_o, seg_l, noise_scale):
        seg_obs = torch.tensor(seg_o).view(1, seg_o.shape[0], seg_o.shape[1]).float().to(device=cuda)
        seg_len = torch.tensor([seg_l]).float().to(device=cuda)
        with torch.no_grad():
            a = lstm_a(seg_obs, seg_len)
        a = torch.gather(a, 1, (seg_len-1).view(-1,1).repeat(1, act_dim).unsqueeze(1).type(torch.LongTensor).to(device=cuda)).squeeze()
        a = a.cpu().numpy()
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            o, ep_ret, ep_len = env.reset(), 0, 0
            o_buff = np.zeros([segment_len, obs_dim])
            o_buff[0,:] = o
            o_buff_len = 1
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o_buff, o_buff_len, 0))
                ep_ret += r
                ep_len += 1
                
                if o_buff_len == segment_len:
                    o_buff[:segment_len-1] = o_buff[1:]
                    o_buff[segment_len-1] = o
                else:
                    o_buff[o_buff_len+1-1] = o
                    o_buff_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0
    o_buff = np.zeros([segment_len, obs_dim])
    o_buff[0,:] = o
    o_buff_len = 1
    
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        if t%200 == 0:
            print("t={}".format(t))
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards, 
        # use the learned policy (with some noise, via act_noise). 
        if t > start_steps:
            a = get_action(o_buff, o_buff_len, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        lstm_bf.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2
        
        
        if o_buff_len == segment_len:
            o_buff[:segment_len-1] = o_buff[1:]
            o_buff[segment_len-1] = o
        else:
            o_buff[o_buff_len+1-1] = o
            o_buff_len += 1
        

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0
            o_buff[0,:] = o
            o_buff_len = 1
    
        # Update handling
        if t >= update_after and t % update_every == 0:
            for _ in range(update_every):
                segment_batch = lstm_bf.sample_segment_batch(batch_size, segment_len)
                segment_batch['seg_obs'] = segment_batch['seg_obs'].cuda()
                segment_batch['seg_act'] = segment_batch['seg_act'].cuda()
                segment_batch['seg_rew'] = segment_batch['seg_rew'].cuda()
                segment_batch['seg_obs2'] = segment_batch['seg_obs2'].cuda()
                segment_batch['seg_done'] = segment_batch['seg_done'].cuda()
                segment_batch['seg_len'] = segment_batch['seg_len'].cuda()
                update(data=segment_batch)

        # End of epoch handling
        if (t+1) % steps_per_epoch == 0:
            epoch = (t+1) // steps_per_epoch

    #         # Save model
    #         if (epoch % save_freq == 0) or (epoch == epochs):
    #             logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()




In [221]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 'gamma': 0.99, 'seed': 0, 'epochs': 50, 'exp_name': 'lstm_ddpg_HalfCheetah_Small_Net'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

ddpg(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_ddpg_HalfCheetah_Small_Net\lstm_ddpg_HalfCheetah_Small_Net_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000021AE0E13048>",
    "epochs":	50,
    "exp_name":	"lstm_ddpg_HalfCheetah_Small_Net",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000021AC81BCD88>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_ddpg_HalfCheetah_Small_Net",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng\\google drive\\git_repos\\spinningup-new\\data\\lstm_ddpg_HalfCheetah_Small_Net\\lstm_ddpg_HalfCheetah_Small_Net_s0",
            "output_file":	{
        

t=14000
t=14200
t=14400
t=14600
t=14800
t=15000
t=15200
t=15400
t=15600
t=15800
---------------------------------------
|             Epoch |               8 |
|      AverageEpRet |            -469 |
|          StdEpRet |             302 |
|          MaxEpRet |            -167 |
|          MinEpRet |            -771 |
|  AverageTestEpRet |            -674 |
|      StdTestEpRet |             291 |
|      MaxTestEpRet |           -99.4 |
|      MinTestEpRet |            -884 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.6e+04 |
|      AverageQVals |            10.6 |
|          StdQVals |            45.1 |
|          MaxQVals |             140 |
|          MinQVals |           -55.6 |
|            LossPi |           -13.5 |
|             LossQ |            4.76 |
|              Time |             670 |
---------------------------------------
t=16000
t=16200
t=16400
t=16600
t=16800
t=17000
t=17200
t=17400
t=17600
t=17800


KeyboardInterrupt: 

## LSTM-TD3

In [116]:

def ddpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, 
         steps_per_epoch=2000, epochs=100, replay_size=int(1e6), 
         segment_len=10,
         gamma=0.99, 
         polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=1000, 
         update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=5, 
         max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """
    Deep Deterministic Policy Gradient (DDPG)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, and a ``q`` module. The ``act`` method and
            ``pi`` module should accept batches of observations as inputs,
            and ``q`` should accept a batch of observations and a batch of 
            actions as inputs. When called, these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q``        (batch,)          | Tensor containing the current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    
    
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    lstm_q1 = LSTMCritic(obs_dim, act_dim)
    lstm_q2 = LSTMCritic(obs_dim, act_dim)
    lstm_a = LSTMActor(obs_dim, act_dim, act_limit)
    lstm_q1_targ = deepcopy(lstm_q1)
    lstm_q2_targ = deepcopy(lstm_q2)
    lstm_a_targ = deepcopy(lstm_a)
    lstm_q1.cuda()
    lstm_q2.cuda()
    lstm_a.cuda()
    lstm_q1_targ.cuda()
    lstm_q2_targ.cuda()
    lstm_a_targ.cuda()


    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in lstm_q1_targ.parameters():
        p.requires_grad = False
    for p in lstm_q2_targ.parameters():
        p.requires_grad = False
    for p in lstm_a_targ.parameters():
        p.requires_grad = False

    # Experience buffer
    lstm_bf = LSTMReplayBuffer(obs_dim, act_dim, int(1e6))
    
    # Set up function for computing DDPG Q-loss
    def compute_loss_q(seg_batch):
        o = seg_batch['seg_obs'].to(device=cuda)
        a = seg_batch['seg_act'].to(device=cuda)
        r = seg_batch['seg_rew'].to(device=cuda)
        o2 = seg_batch['seg_obs2'].to(device=cuda)
        d = seg_batch['seg_done'].to(device=cuda)
        l = seg_batch['seg_len'].to(device=cuda)
        m = seg_batch['seg_mask'].to(device=cuda)
        
#         q1 = lstm_q1(o, a, l)
#         q2 = lstm_q2(o, a, l)

        # Bellman backup for Q function
        with torch.no_grad():
#             batch_size = a.shape[0]
#             seg_max_len = a.shape[1]
            
#             seg_act2 = torch.zeros(a.shape).to(device=cuda)
#             # Move forward
#             seg_act2[:, :seg_max_len-1, :] = a[:, 1:, :]
#             pi_targ = lstm_a_targ(o2, l)
# #             # 1. Add predicted action
# #             seg_act2[np.arange(batch_size), (seg_batch['seg_len']-1).type(torch.LongTensor).to(device=cuda), :] = torch.gather(pi_targ, 1, (seg_batch['seg_len']-1).view(-1,1).repeat(1, act_dim).unsqueeze(1).type(torch.LongTensor).to(device=cuda)).squeeze(1)
#             # 2. Directely use predicted action (better than combined)
#             seg_act2 = pi_targ
            
#             q1_pi_targ = lstm_q1_targ(o2, seg_act2, l)
#             q2_pi_targ = lstm_q2_targ(o2, seg_act2, l)
            
#             gathered_rew = torch.gather(r, 1, (l-1).view(-1,1).type(torch.LongTensor).to(device=cuda))
#             gathered_done = torch.gather(d, 1, (l-1).view(-1,1).type(torch.LongTensor).to(device=cuda))
            
#             q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
#             backup = gathered_rew + gamma * (1-gathered_done) * q_pi_targ
            
            q_loss_type = "comb_pred_targ_pi"
            if q_loss_type == "comb_pred_targ_pi":
                #
                pi_targ = torch.zeros(a.shape).to(device=cuda)
                pi_targ[:, :a.shape[1]-1, :] = a[:, 1:, :]   # Move forward
                pi_targ[np.arange(a.shape[0]), (l-1).type(torch.LongTensor).to(device=cuda), :] = torch.gather(lstm_a_targ(o2, l), 1, (l-1).view(-1,1).repeat(1, act_dim).unsqueeze(1).type(torch.LongTensor).to(device=cuda)).squeeze(1)
            elif q_loss_type == "full_pred_targ_pi":
                # 3.Full prediction
                pi_targ = lstm_a_targ(o2, l)
            
            q1_pi_targ = lstm_q1_targ(o2, pi_targ, l, gather_last=False)
            q2_pi_targ = lstm_q2_targ(o2, pi_targ, l, gather_last=False)
            
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r+((1-d)*q_pi_targ.squeeze(2))
            
        q1 = lstm_q1(o, a, l, gather_last=False).squeeze(2)
        q2 = lstm_q2(o, a, l, gather_last=False).squeeze(2)
        
        # MSE loss against Bellman backup
        loss_q1 = ((m*(q1 - backup))**2).mean()
        loss_q2 = ((m*(q2 - backup))**2).mean()
        loss_q = loss_q1 + loss_q2
#         import pdb
#         pdb.set_trace()
        # Useful info for logging
        loss_info = dict(Q1Vals=q1.squeeze().detach().cpu().numpy(),
                         Q2Vals=q2.squeeze().detach().cpu().numpy())
        
        return loss_q, loss_info

    # Set up function for computing DDPG pi loss
    def compute_loss_pi(seg_batch):
        o = seg_batch['seg_obs'].to(device=cuda)
        a = seg_batch['seg_act'].to(device=cuda)
        l = seg_batch['seg_len'].to(device=cuda)
        m = seg_batch['seg_mask'].to(device=cuda)
        
        pi_loss_type = "full_pred_pi"
        if pi_loss_type == "full_pred_pi":
            # 1. 
            pi = lstm_a(o, l)
            q1_pi = lstm_q1(o, pi, l, gather_last=False).squeeze(2)
            pi_loss = - (q1_pi*m).mean()
        elif pi_loss_type == "comb_pred_pi":
            # 2.
            # Combine past action and the predicted action
            pi = lstm_a(o, l)
            seg_act = deepcopy(a)
            seg_act[np.arange(a.shape[0]), (l-1).type(torch.LongTensor).to(device=cuda), :] = torch.gather(pi, 1, (l-1).view(-1,1).repeat(1, act_dim).unsqueeze(1).type(torch.LongTensor).to(device=cuda)).squeeze()

            q1_pi = lstm_q1(o, seg_act, l, gather_last=False).squeeze(2)
            pi_loss = - (q1_pi*m).mean()
        
        return pi_loss

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(lstm_a.parameters(), lr=pi_lr)
    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(lstm_q1.parameters(), lstm_q2.parameters())
    q_optimizer = Adam(q_params, lr=q_lr)

    # # Set up model saving
    # logger.setup_pytorch_saver(ac)

    def update(data):
        # First run one gradient descent step for Q.
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

#         # Freeze Q-network so you don't waste computational effort 
#         # computing gradients for it during the policy learning step.
#         for p in q_params:
#             p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

#         # Unfreeze Q-network so you can optimize it at next DDPG step.
#         for p in q_params:
#             p.requires_grad = True

        # Record things
        logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(lstm_a.parameters(), lstm_a_targ.parameters()):
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)
            for p, p_targ in zip(lstm_q1.parameters(), lstm_q1_targ.parameters()):
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)
            for p, p_targ in zip(lstm_q2.parameters(), lstm_q2_targ.parameters()):
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)


    def get_action(seg_o, seg_l, noise_scale):
        seg_obs = torch.tensor(seg_o).view(1, seg_o.shape[0], seg_o.shape[1]).float().to(device=cuda)
        seg_len = torch.tensor([seg_l]).float().to(device=cuda)
        with torch.no_grad():
            a = lstm_a(seg_obs, seg_len)
        a = torch.gather(a, 1, (seg_len-1).view(-1,1).repeat(1, act_dim).unsqueeze(1).type(torch.LongTensor).to(device=cuda)).squeeze(1)
        a = a.cpu().numpy()
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            o, ep_ret, ep_len = env.reset(), 0, 0
            o_buff = np.zeros([segment_len, obs_dim])
            o_buff[0,:] = o
            o_buff_len = 1
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o_buff, o_buff_len, 0))
                ep_ret += r
                ep_len += 1
                
                if o_buff_len == segment_len:
                    o_buff[:segment_len-1] = o_buff[1:]
                    o_buff[segment_len-1] = list(o)
                else:
                    o_buff[o_buff_len+1-1] = list(o)
                    o_buff_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0
    o_buff = np.zeros([segment_len, obs_dim])
    o_buff[0,:] = o
    o_buff_len = 1
    
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        if t%200 == 0:
            print("t={}".format(t))
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards, 
        # use the learned policy (with some noise, via act_noise). 
        if t > start_steps:
            a = get_action(o_buff, o_buff_len, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        lstm_bf.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2
        
        
        if o_buff_len == segment_len:
            o_buff[:segment_len-1] = o_buff[1:]
            o_buff[segment_len-1] = list(o)
        else:
            o_buff[o_buff_len+1-1] = list(o)
            o_buff_len += 1
        

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0
            o_buff[0,:] = o
            o_buff_len = 1
    
        # Update handling
        if t >= update_after and t % update_every == 0:
            for _ in range(update_every):
                segment_batch = lstm_bf.sample_segment_batch(batch_size, segment_len)
                segment_batch['seg_obs'] = segment_batch['seg_obs'].cuda()
                segment_batch['seg_act'] = segment_batch['seg_act'].cuda()
                segment_batch['seg_rew'] = segment_batch['seg_rew'].cuda()
                segment_batch['seg_obs2'] = segment_batch['seg_obs2'].cuda()
                segment_batch['seg_done'] = segment_batch['seg_done'].cuda()
                segment_batch['seg_len'] = segment_batch['seg_len'].cuda()
                update(data=segment_batch)

        # End of epoch handling
        if (t+1) % steps_per_epoch == 0:
            epoch = (t+1) // steps_per_epoch

    #         # Save model
    #         if (epoch % save_freq == 0) or (epoch == epochs):
    #             logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()




In [117]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'segment_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_Small_Net_SegL5_r1_CombPiTarg_All_Mean_PiLossFull_NoFreeze'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

ddpg(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     segment_len=args['segment_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Small_Net_SegL5_r1_CombPiTarg_All_Mean_PiLossFull_NoFreeze\lstm_td3_HalfCheetah_Small_Net_SegL5_r1_CombPiTarg_All_Mean_PiLossFull_NoFreeze_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x00000189952D7AF8>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Small_Net_SegL5_r1_CombPiTarg_All_Mean_PiLossFull_NoFreeze",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x00000189856C12C8>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Small_Net_SegL5_r1_CombPiTarg_All_Mean_PiLossFull_NoFreeze",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
          

t=12000
t=12200
t=12400
t=12600
t=12800
t=13000
t=13200
t=13400
t=13600
t=13800
---------------------------------------
|             Epoch |               7 |
|      AverageEpRet |             107 |
|          StdEpRet |            31.6 |
|          MaxEpRet |             138 |
|          MinEpRet |            75.2 |
|  AverageTestEpRet |            32.6 |
|      StdTestEpRet |            92.9 |
|      MaxTestEpRet |             132 |
|      MinTestEpRet |           -81.9 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.4e+04 |
|     AverageQ1Vals |           -91.7 |
|         StdQ1Vals |             167 |
|         MaxQ1Vals |            29.1 |
|         MinQ1Vals |           -74.4 |
|     AverageQ2Vals |           -91.7 |
|         StdQ2Vals |             167 |
|         MaxQ2Vals |            30.8 |
|         MinQ2Vals |           -75.8 |
|            LossPi |            14.6 |
|             LossQ |            6.21 |


t=28000
t=28200
t=28400
t=28600
t=28800
t=29000
t=29200
t=29400
t=29600
t=29800
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |             239 |
|          StdEpRet |            7.41 |
|          MaxEpRet |             247 |
|          MinEpRet |             232 |
|  AverageTestEpRet |            25.3 |
|      StdTestEpRet |             376 |
|      MaxTestEpRet |             322 |
|      MinTestEpRet |            -705 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |            -230 |
|         StdQ1Vals |             427 |
|         MaxQ1Vals |            23.2 |
|         MinQ1Vals |            -212 |
|     AverageQ2Vals |            -230 |
|         StdQ2Vals |             427 |
|         MaxQ2Vals |              25 |
|         MinQ2Vals |            -209 |
|            LossPi |            39.7 |
|             LossQ |            27.3 |


KeyboardInterrupt: 

In [114]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'segment_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_Small_Net_SegL5_r1_CombPiTarg_All_Mean_PiLossFull'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

ddpg(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     segment_len=args['segment_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Small_Net_SegL5_r1_CombPiTarg_All_Mean_PiLossFull\lstm_td3_HalfCheetah_Small_Net_SegL5_r1_CombPiTarg_All_Mean_PiLossFull_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x00000189805563A8>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Small_Net_SegL5_r1_CombPiTarg_All_Mean_PiLossFull",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000001899501D208>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Small_Net_SegL5_r1_CombPiTarg_All_Mean_PiLossFull",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng

t=12000
t=12200
t=12400
t=12600
t=12800
t=13000
t=13200
t=13400
t=13600
t=13800
---------------------------------------
|             Epoch |               7 |
|      AverageEpRet |            97.7 |
|          StdEpRet |              75 |
|          MaxEpRet |             173 |
|          MinEpRet |            22.7 |
|  AverageTestEpRet |             310 |
|      StdTestEpRet |             235 |
|      MaxTestEpRet |             533 |
|      MinTestEpRet |           -60.7 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.4e+04 |
|     AverageQ1Vals |            -127 |
|         StdQ1Vals |             228 |
|         MaxQ1Vals |            27.8 |
|         MinQ1Vals |           -70.8 |
|     AverageQ2Vals |            -127 |
|         StdQ2Vals |             228 |
|         MaxQ2Vals |            26.6 |
|         MinQ2Vals |           -71.5 |
|            LossPi |            21.5 |
|             LossQ |            5.07 |


t=28000
t=28200
t=28400
t=28600
t=28800
t=29000
t=29200
t=29400
t=29600
t=29800
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |             564 |
|          StdEpRet |            16.1 |
|          MaxEpRet |             580 |
|          MinEpRet |             548 |
|  AverageTestEpRet |             529 |
|      StdTestEpRet |            47.8 |
|      MaxTestEpRet |             567 |
|      MinTestEpRet |             436 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |            -210 |
|         StdQ1Vals |             404 |
|         MaxQ1Vals |            56.2 |
|         MinQ1Vals |            -259 |
|     AverageQ2Vals |            -210 |
|         StdQ2Vals |             404 |
|         MaxQ2Vals |            57.8 |
|         MinQ2Vals |            -249 |
|            LossPi |            33.7 |
|             LossQ |            37.3 |


t=44000
t=44200
t=44400
t=44600
t=44800
t=45000
t=45200
t=45400
t=45600
t=45800
---------------------------------------
|             Epoch |              23 |
|      AverageEpRet |             578 |
|          StdEpRet |            5.31 |
|          MaxEpRet |             583 |
|          MinEpRet |             573 |
|  AverageTestEpRet |             537 |
|      StdTestEpRet |            37.8 |
|      MaxTestEpRet |             590 |
|      MinTestEpRet |             489 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.6e+04 |
|     AverageQ1Vals |            -450 |
|         StdQ1Vals |             881 |
|         MaxQ1Vals |             109 |
|         MinQ1Vals |            -525 |
|     AverageQ2Vals |            -450 |
|         StdQ2Vals |             881 |
|         MaxQ2Vals |             103 |
|         MinQ2Vals |            -523 |
|            LossPi |            76.5 |
|             LossQ |             172 |


t=60000
t=60200
t=60400
t=60600
t=60800
t=61000
t=61200
t=61400
t=61600
t=61800
---------------------------------------
|             Epoch |              31 |
|      AverageEpRet |             568 |
|          StdEpRet |            39.2 |
|          MaxEpRet |             607 |
|          MinEpRet |             528 |
|  AverageTestEpRet |             303 |
|      StdTestEpRet |             427 |
|      MaxTestEpRet |             578 |
|      MinTestEpRet |            -546 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.2e+04 |
|     AverageQ1Vals |            -881 |
|         StdQ1Vals |        1.72e+03 |
|         MaxQ1Vals |             144 |
|         MinQ1Vals |            -984 |
|     AverageQ2Vals |            -881 |
|         StdQ2Vals |        1.72e+03 |
|         MaxQ2Vals |             143 |
|         MinQ2Vals |            -995 |
|            LossPi |             156 |
|             LossQ |             623 |


KeyboardInterrupt: 

In [110]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'segment_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_Small_Net_SegL5_r1_PredPiTarg_All_Mean_PiLossFull'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

ddpg(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     segment_len=args['segment_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Small_Net_SegL5_r1_PredPiTarg_All_Mean_PiLossFull\lstm_td3_HalfCheetah_Small_Net_SegL5_r1_PredPiTarg_All_Mean_PiLossFull_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x000001898052CC18>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Small_Net_SegL5_r1_PredPiTarg_All_Mean_PiLossFull",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000001899500D548>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Small_Net_SegL5_r1_PredPiTarg_All_Mean_PiLossFull",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng

t=12000
t=12200
t=12400
t=12600
t=12800
t=13000
t=13200
t=13400
t=13600
t=13800
---------------------------------------
|             Epoch |               7 |
|      AverageEpRet |            67.1 |
|          StdEpRet |            17.2 |
|          MaxEpRet |            84.3 |
|          MinEpRet |            49.9 |
|  AverageTestEpRet |           -25.1 |
|      StdTestEpRet |            33.4 |
|      MaxTestEpRet |            20.8 |
|      MinTestEpRet |           -83.1 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.4e+04 |
|     AverageQ1Vals |            -113 |
|         StdQ1Vals |             202 |
|         MaxQ1Vals |            11.3 |
|         MinQ1Vals |           -42.7 |
|     AverageQ2Vals |            -113 |
|         StdQ2Vals |             202 |
|         MaxQ2Vals |            11.6 |
|         MinQ2Vals |           -42.6 |
|            LossPi |            22.3 |
|             LossQ |            1.38 |


t=28000
t=28200
t=28400
t=28600
t=28800
t=29000
t=29200
t=29400
t=29600
t=29800
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |             642 |
|          StdEpRet |            1.61 |
|          MaxEpRet |             644 |
|          MinEpRet |             641 |
|  AverageTestEpRet |             725 |
|      StdTestEpRet |              47 |
|      MaxTestEpRet |             803 |
|      MinTestEpRet |             658 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |           -80.9 |
|         StdQ1Vals |             154 |
|         MaxQ1Vals |            21.2 |
|         MinQ1Vals |            -117 |
|     AverageQ2Vals |           -80.9 |
|         StdQ2Vals |             154 |
|         MaxQ2Vals |            21.6 |
|         MinQ2Vals |            -117 |
|            LossPi |            15.9 |
|             LossQ |            5.78 |


t=44000
t=44200
t=44400
t=44600
t=44800
t=45000
t=45200
t=45400
t=45600
t=45800
---------------------------------------
|             Epoch |              23 |
|      AverageEpRet |             713 |
|          StdEpRet |              49 |
|          MaxEpRet |             762 |
|          MinEpRet |             664 |
|  AverageTestEpRet |             823 |
|      StdTestEpRet |            62.8 |
|      MaxTestEpRet |             902 |
|      MinTestEpRet |             723 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.6e+04 |
|     AverageQ1Vals |             141 |
|         StdQ1Vals |             260 |
|         MaxQ1Vals |            65.2 |
|         MinQ1Vals |            -128 |
|     AverageQ2Vals |             141 |
|         StdQ2Vals |             260 |
|         MaxQ2Vals |            65.6 |
|         MinQ2Vals |            -129 |
|            LossPi |           -28.4 |
|             LossQ |            8.68 |


t=60000
t=60200
t=60400
t=60600
t=60800
t=61000
t=61200
t=61400
t=61600
t=61800
---------------------------------------
|             Epoch |              31 |
|      AverageEpRet |             858 |
|          StdEpRet |            13.6 |
|          MaxEpRet |             871 |
|          MinEpRet |             844 |
|  AverageTestEpRet |             938 |
|      StdTestEpRet |            32.8 |
|      MaxTestEpRet |             969 |
|      MinTestEpRet |             881 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.2e+04 |
|     AverageQ1Vals |             387 |
|         StdQ1Vals |             701 |
|         MaxQ1Vals |             169 |
|         MinQ1Vals |            -208 |
|     AverageQ2Vals |             387 |
|         StdQ2Vals |             701 |
|         MaxQ2Vals |             169 |
|         MinQ2Vals |            -205 |
|            LossPi |           -77.8 |
|             LossQ |            17.8 |


t=76000
t=76200
t=76400
t=76600
t=76800
t=77000
t=77200
t=77400
t=77600
t=77800
---------------------------------------
|             Epoch |              39 |
|      AverageEpRet |             897 |
|          StdEpRet |            43.2 |
|          MaxEpRet |             940 |
|          MinEpRet |             854 |
|  AverageTestEpRet |             638 |
|      StdTestEpRet |            68.9 |
|      MaxTestEpRet |             700 |
|      MinTestEpRet |             526 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         7.8e+04 |
|     AverageQ1Vals |             637 |
|         StdQ1Vals |        1.15e+03 |
|         MaxQ1Vals |             285 |
|         MinQ1Vals |            -224 |
|     AverageQ2Vals |             637 |
|         StdQ2Vals |        1.15e+03 |
|         MaxQ2Vals |             286 |
|         MinQ2Vals |            -228 |
|            LossPi |            -128 |
|             LossQ |            21.1 |


t=92000
t=92200
t=92400
t=92600
t=92800
t=93000
t=93200
t=93400
t=93600
t=93800
---------------------------------------
|             Epoch |              47 |
|      AverageEpRet |        1.28e+03 |
|          StdEpRet |             371 |
|          MaxEpRet |        1.65e+03 |
|          MinEpRet |             904 |
|  AverageTestEpRet |             960 |
|      StdTestEpRet |            27.2 |
|      MaxTestEpRet |             993 |
|      MinTestEpRet |             915 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         9.4e+04 |
|     AverageQ1Vals |             905 |
|         StdQ1Vals |        1.62e+03 |
|         MaxQ1Vals |             401 |
|         MinQ1Vals |            -168 |
|     AverageQ2Vals |             905 |
|         StdQ2Vals |        1.62e+03 |
|         MaxQ2Vals |             401 |
|         MinQ2Vals |            -169 |
|            LossPi |            -181 |
|             LossQ |            19.4 |


In [109]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'segment_len': 5,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_Small_Net_SegL5_r1_PredPiTarg_All_Mean_PiLossCombined'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

ddpg(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     segment_len=args['segment_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Small_Net_SegL5_r1_PredPiTarg_All_Mean_PiLossCombined\lstm_td3_HalfCheetah_Small_Net_SegL5_r1_PredPiTarg_All_Mean_PiLossCombined_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x00000189856DE1F8>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Small_Net_SegL5_r1_PredPiTarg_All_Mean_PiLossCombined",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x000001898567B348>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Small_Net_SegL5_r1_PredPiTarg_All_Mean_PiLossCombined",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\

t=12000
t=12200
t=12400
t=12600
t=12800
t=13000
t=13200
t=13400
t=13600
t=13800
---------------------------------------
|             Epoch |               7 |
|      AverageEpRet |            -230 |
|          StdEpRet |            41.1 |
|          MaxEpRet |            -189 |
|          MinEpRet |            -271 |
|  AverageTestEpRet |            -260 |
|      StdTestEpRet |            69.2 |
|      MaxTestEpRet |            -155 |
|      MinTestEpRet |            -346 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.4e+04 |
|     AverageQ1Vals |            -120 |
|         StdQ1Vals |             215 |
|         MaxQ1Vals |            15.1 |
|         MinQ1Vals |           -43.4 |
|     AverageQ2Vals |            -120 |
|         StdQ2Vals |             215 |
|         MaxQ2Vals |            15.4 |
|         MinQ2Vals |           -43.8 |
|            LossPi |            23.7 |
|             LossQ |            1.75 |


t=28000
t=28200
t=28400
t=28600
t=28800
t=29000
t=29200
t=29400
t=29600
t=29800
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |             230 |
|          StdEpRet |            26.9 |
|          MaxEpRet |             257 |
|          MinEpRet |             203 |
|  AverageTestEpRet |             186 |
|      StdTestEpRet |            62.8 |
|      MaxTestEpRet |             257 |
|      MinTestEpRet |             109 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |            -385 |
|         StdQ1Vals |             690 |
|         MaxQ1Vals |           -39.8 |
|         MinQ1Vals |            -138 |
|     AverageQ2Vals |            -385 |
|         StdQ2Vals |             690 |
|         MaxQ2Vals |           -40.8 |
|         MinQ2Vals |            -137 |
|            LossPi |            76.7 |
|             LossQ |            4.36 |


t=44000
t=44200
t=44400
t=44600
t=44800
t=45000
t=45200
t=45400
t=45600
t=45800
---------------------------------------
|             Epoch |              23 |
|      AverageEpRet |             340 |
|          StdEpRet |            8.84 |
|          MaxEpRet |             349 |
|          MinEpRet |             331 |
|  AverageTestEpRet |             324 |
|      StdTestEpRet |            44.1 |
|      MaxTestEpRet |             372 |
|      MinTestEpRet |             241 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         4.6e+04 |
|     AverageQ1Vals |            -497 |
|         StdQ1Vals |             892 |
|         MaxQ1Vals |           -35.9 |
|         MinQ1Vals |            -219 |
|     AverageQ2Vals |            -497 |
|         StdQ2Vals |             892 |
|         MaxQ2Vals |           -35.3 |
|         MinQ2Vals |            -220 |
|            LossPi |              99 |
|             LossQ |            6.91 |


t=60000
t=60200
t=60400
t=60600
t=60800
t=61000
t=61200
t=61400
t=61600
t=61800
---------------------------------------
|             Epoch |              31 |
|      AverageEpRet |             377 |
|          StdEpRet |            3.03 |
|          MaxEpRet |             380 |
|          MinEpRet |             374 |
|  AverageTestEpRet |             361 |
|      StdTestEpRet |            15.9 |
|      MaxTestEpRet |             388 |
|      MinTestEpRet |             338 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         6.2e+04 |
|     AverageQ1Vals |            -557 |
|         StdQ1Vals |        1.01e+03 |
|         MaxQ1Vals |           -15.2 |
|         MinQ1Vals |            -285 |
|     AverageQ2Vals |            -557 |
|         StdQ2Vals |        1.01e+03 |
|         MaxQ2Vals |           -15.1 |
|         MinQ2Vals |            -285 |
|            LossPi |             111 |
|             LossQ |            9.56 |


KeyboardInterrupt: 

In [102]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 
        'segment_len': 1,
        'gamma': 0.99, 'seed': 0, 'epochs': 50, 
        'exp_name': 'lstm_td3_HalfCheetah_Small_Net_SegL1_r1_PredPiTarg_All_Mean'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

ddpg(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     segment_len=args['segment_len'],
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\lstm_td3_HalfCheetah_Small_Net_SegL1_r1_PredPiTarg_All_Mean\lstm_td3_HalfCheetah_Small_Net_SegL1_r1_PredPiTarg_All_Mean_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x000001898546CEE8>",
    "epochs":	50,
    "exp_name":	"lstm_td3_HalfCheetah_Small_Net_SegL1_r1_PredPiTarg_All_Mean",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000018980482308>":	{
            "epoch_dict":	{},
            "exp_name":	"lstm_td3_HalfCheetah_Small_Net_SegL1_r1_PredPiTarg_All_Mean",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng\\google drive\\git_repos\\spinningup-new\\d

t=12000
t=12200
t=12400
t=12600
t=12800
t=13000
t=13200
t=13400
t=13600
t=13800
---------------------------------------
|             Epoch |               7 |
|      AverageEpRet |             748 |
|          StdEpRet |             115 |
|          MaxEpRet |             864 |
|          MinEpRet |             633 |
|  AverageTestEpRet |             988 |
|      StdTestEpRet |            66.1 |
|      MaxTestEpRet |        1.11e+03 |
|      MinTestEpRet |             907 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |         1.4e+04 |
|     AverageQ1Vals |            10.4 |
|         StdQ1Vals |            13.1 |
|         MaxQ1Vals |            38.6 |
|         MinQ1Vals |           -21.9 |
|     AverageQ2Vals |            10.4 |
|         StdQ2Vals |            13.1 |
|         MaxQ2Vals |            40.2 |
|         MinQ2Vals |           -22.5 |
|            LossPi |           -11.6 |
|             LossQ |            2.25 |


t=28000
t=28200
t=28400
t=28600
t=28800
t=29000
t=29200
t=29400
t=29600
t=29800
---------------------------------------
|             Epoch |              15 |
|      AverageEpRet |        2.92e+03 |
|          StdEpRet |             156 |
|          MaxEpRet |        3.07e+03 |
|          MinEpRet |        2.76e+03 |
|  AverageTestEpRet |        3.24e+03 |
|      StdTestEpRet |            85.8 |
|      MaxTestEpRet |        3.34e+03 |
|      MinTestEpRet |        3.08e+03 |
|             EpLen |           1e+03 |
|         TestEpLen |           1e+03 |
| TotalEnvInteracts |           3e+04 |
|     AverageQ1Vals |             154 |
|         StdQ1Vals |            56.6 |
|         MaxQ1Vals |             233 |
|         MinQ1Vals |            4.29 |
|     AverageQ2Vals |             154 |
|         StdQ2Vals |            56.6 |
|         MaxQ2Vals |             234 |
|         MinQ2Vals |            3.99 |
|            LossPi |            -157 |
|             LossQ |            16.5 |


KeyboardInterrupt: 

## MLP-TD3

In [270]:

def ddpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, 
         steps_per_epoch=2000, epochs=100, replay_size=int(1e6), gamma=0.99, 
         polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=1000, 
         update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=5, 
         max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """
    Deep Deterministic Policy Gradient (DDPG)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, and a ``q`` module. The ``act`` method and
            ``pi`` module should accept batches of observations as inputs,
            and ``q`` should accept a batch of observations and a batch of 
            actions as inputs. When called, these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q``        (batch,)          | Tensor containing the current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    segment_len=1
    
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    mlp_q1 = MLPCritic(obs_dim, act_dim)
    mlp_q2 = MLPCritic(obs_dim, act_dim)
    mlp_a = MLPActor(obs_dim, act_dim, act_limit)
    mlp_q1_targ = deepcopy(mlp_q1)
    mlp_q2_targ = deepcopy(mlp_q2)
    mlp_a_targ = deepcopy(mlp_a)
    mlp_q1.cuda()
    mlp_q2.cuda()
    mlp_a.cuda()
    mlp_q1_targ.cuda()
    mlp_q2_targ.cuda()
    mlp_a_targ.cuda()


    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in mlp_q1_targ.parameters():
        p.requires_grad = False
    for p in mlp_q2_targ.parameters():
        p.requires_grad = False
    for p in mlp_a_targ.parameters():
        p.requires_grad = False

    # Experience buffer
    lstm_bf = LSTMReplayBuffer(obs_dim, act_dim, int(1e6))
    
    # Set up function for computing DDPG Q-loss
    def compute_loss_q(batch):
        q1 = mlp_q1(batch['obs'].to(device=cuda), batch['act'].to(device=cuda))
        q2 = mlp_q2(batch['obs'].to(device=cuda), batch['act'].to(device=cuda))

        # Bellman backup for Q function
        with torch.no_grad():
            pi_targ = mlp_a_targ(batch['obs2'].to(device=cuda))
            q1_pi_targ = mlp_q1_targ(batch['obs2'].to(device=cuda), pi_targ)
            q2_pi_targ = mlp_q2_targ(batch['obs2'].to(device=cuda), pi_targ)

            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = batch['rew'].to(device=cuda) + gamma * (1-batch['done'].to(device=cuda)) * q_pi_targ
        import pdb
        pdb.set_trace()
        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2
        
        # Useful info for logging
        loss_info = dict(Q1Vals=q1.squeeze().detach().cpu().numpy(),
                         Q2Vals=q2.squeeze().detach().cpu().numpy())
        
        return loss_q, loss_info

    # Set up function for computing DDPG pi loss
    def compute_loss_pi(batch):
        # 1. 
        pi = mlp_a(batch['obs'].cuda())
        q1_pi = mlp_q1(batch['obs'].cuda(), pi)
        return -q1_pi.mean()

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(mlp_a.parameters(), lr=pi_lr)
    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(mlp_q1.parameters(), mlp_q2.parameters())
    q_optimizer = Adam(q_params, lr=q_lr)

    # # Set up model saving
    # logger.setup_pytorch_saver(ac)

    def update(data):
        # First run one gradient descent step for Q.
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Freeze Q-network so you don't waste computational effort 
        # computing gradients for it during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-network so you can optimize it at next DDPG step.
        for p in q_params:
            p.requires_grad = True

        # Record things
        logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(mlp_a.parameters(), mlp_a_targ.parameters()):
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)
            for p, p_targ in zip(mlp_q1.parameters(), mlp_q1_targ.parameters()):
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)
            for p, p_targ in zip(mlp_q2.parameters(), mlp_q2_targ.parameters()):
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)


    def get_action(o, noise_scale):
        o = torch.tensor(o).view(1, -1).float().to(device=cuda)
        with torch.no_grad():
            a = mlp_a(o)
        a = a.cpu().numpy()
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
                
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0
    
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        if t%200 == 0:
            print("t={}".format(t))
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards, 
        # use the learned policy (with some noise, via act_noise). 
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        lstm_bf.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0
    
        # Update handling
        if t >= update_after and t % update_every == 0:
            for _ in range(update_every):
                segment_batch = lstm_bf.sample_batch(batch_size)
                segment_batch['obs'].cuda()
                segment_batch['act'].cuda()
                segment_batch['rew'].cuda()
                segment_batch['obs2'].cuda()
                segment_batch['done'].cuda()
                update(data=segment_batch)

        # End of epoch handling
        if (t+1) % steps_per_epoch == 0:
            epoch = (t+1) // steps_per_epoch

    #         # Save model
    #         if (epoch % save_freq == 0) or (epoch == epochs):
    #             logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()




In [271]:
args = {'env': 'HalfCheetah-v2', 'hid': 256, 'l': 2, 'gamma': 0.99, 'seed': 0, 'epochs': 50, 'exp_name': 'mlp_td3_HalfCheetah'}

from spinup.utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args['exp_name'], args['seed'])

ddpg(lambda : gym.make(args['env']), actor_critic=core.MLPActorCritic,
     ac_kwargs=dict(hidden_sizes=[args['hid']]*args['l']), 
     gamma=args['gamma'], seed=args['seed'], epochs=args['epochs'],
     logger_kwargs=logger_kwargs)

[32;1mLogging data to c:\users\lingheng\google drive\git_repos\spinningup-new\data\mlp_td3_HalfCheetah\mlp_td3_HalfCheetah_s0\progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "hidden_sizes":	[
            256,
            256
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"MLPActorCritic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x0000021AC68F8DC8>",
    "epochs":	50,
    "exp_name":	"mlp_td3_HalfCheetah",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x0000021AC81D4408>":	{
            "epoch_dict":	{},
            "exp_name":	"mlp_td3_HalfCheetah",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"c:\\users\\lingheng\\google drive\\git_repos\\spinningup-new\\data\\mlp_td3_HalfCheetah\\mlp_td3_HalfCheetah_s0",
            "output_file":	{
                "<_io.TextIOWrapper name='c:\\\\users\\\\lingheng\\\\google driv

BdbQuit: 