In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from IPython.display import clear_output
import matplotlib.pyplot as plt

from timeit import default_timer as timer

https://pytorch.org/docs/stable/generated/torch.nn.TripletMarginLoss.html
https://arxiv.org/pdf/2010.02663.pdf
https://github.com/ucla-rlcourse/DeepRL-Tutorials/blob/master/12.A2C.ipynb

Configurations

In [7]:
COLLISION_PENALTY  = -10
COMPLETION_REWARD  = 100
VISITATION_PENALTY = -10
NUM_STEPS = 100
ENCODER_OUTPUT_DIM = 128
ROLLOUT = 100
GAMMA = 0.99
LR = 7e-4
ENTROPY_LOSS_WEIGHT = 0.01
VALUE_LOSS_WEIGHT = 0.5
GRAD_NORM_MAX = 0.5
NUM_ACTIONS = 8
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") #cuda is gpu
NUM_AGENTS = 3
ENV_STEPS = 10000

seed = 1

torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
torch.set_num_threads(1)
np.set_printoptions(linewidth=200)
move_dict = {0:(1,0), 1:(1,1), 2:(0,1), 3:(-1,1), 4:(-1,0), 5:(-1,-1), 6:(0,-1), 7:(1,-1)}

Rollout Storage

In [8]:
class RolloutStorage(object):
    def __init__(self, num_steps, num_processes, obs_shape, state_shape, action_space, device):
        self.observations = torch.zeros(num_steps + 1, num_processes, *obs_shape).to(device) # Repurpose number of processes to be number of environments
        self.state = torch.zeros(num_steps + 1, *state_shape).to(device)
        self.rewards = torch.zeros(num_steps, num_processes, 1).to(device)
        self.value_preds = torch.zeros(num_steps + 1, num_processes, 1).to(device)
        self.returns = torch.zeros(num_steps + 1, num_processes, 1).to(device)
        self.action_log_probs = torch.zeros(num_steps, num_processes, 1).to(device)
        self.actions = torch.zeros(num_steps, num_processes, 1).to(device, torch.long)
        self.masks = torch.ones(num_steps + 1, num_processes, 1).to(device)

        self.num_steps = num_steps
        self.step = 0

    def insert(self, current_obs, action, action_log_prob, value_pred, reward, mask, state):
        """Insert values into rollout storage"""
        self.observations[self.step + 1].copy_(current_obs)
        self.actions[self.step].copy_(action)
        self.action_log_probs[self.step].copy_(action_log_prob)
        self.value_preds[self.step].copy_(value_pred)
        self.rewards[self.step].copy_(reward)
        self.masks[self.step + 1].copy_(mask)
        self.state[self.step].copy_(state)

        self.step = (self.step + 1) % self.num_steps

    def after_update(self):
        self.observations[0].copy_(self.observations[-1])
        self.masks[0].copy_(self.masks[-1])

    def compute_returns(self, next_value, gamma):
        self.returns[-1] = next_value
        for step in reversed(range(self.rewards.size(0))):
            self.returns[step] = self.returns[step + 1] * \
                gamma * self.masks[step + 1] + self.rewards[step]

Networks

In [9]:
'''
Embedding Network: Each agent encodes observations using a fully connected layer to 
enable heterogeneous teams
input_dim = sensor package x sensor_package (e.g. 6*6=36 oberservations)
output_dim > input_dim with a fixed feature length across all actors

Input: Observation of size sensor package
Output: Fixed length feature vector
'''
class Encoder(nn.Module):
  def __init__(self, input_dim, output_dim):
    super(Encoder, self).__init__()

    self.embedding = nn.Linear(input_dim, output_dim)
  def forward(self, obs):
    if len(obs.size()) == 3:
      obs = torch.flatten(obs, start_dim=1)
    else:
      obs = torch.flatten(obs)
    
    x = self.embedding(obs)
    return x 


'''
Critic Network: is a shared centralized critic to estimate value fuction V(St,ø) found by minimizing MSE loss
Critic parameters ø are updated by mini-batch gradient descent
At execution time, the critic is removed and agents excute their policies individually

Input: State Information
Output: Value estimate of the state
'''
class CriticNetwork(nn.Module):
  def __init__(self, state_size, hidden_units=100):
    super(CriticNetwork, self).__init__()
    state_size = np.prod(state_size)

    self.critic = nn.Sequential(
        nn.Linear(state_size, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, 1)
    )

  def forward(self, state):
    if len(state.size()) == 3:
      state = torch.flatten(state, start_dim=1)
    else:
      state = torch.flatten(state)

    value = self.critic(state) 
    return value

'''
Critic Network

Input: State Information Unflattened
Output: Value estimate of the state
'''
class CNNCritic(nn.Module):
  def __init__(self, state_size, hidden_units=100):
    super(CriticNetwork, self).__init__()

    self.critic = nn.Sequential(
        nn.Conv2d(state_size, hidden_units, kernel_size=3, stride=1),
        nn.ReLU(),
        nn.Conv2d(hidden_units, hidden_units*2, kernel_size=3, stride=1),
        nn.ReLU(),
        nn.MaxPool2d(hidden_units*2, hidden_units//2),
        nn.Linear(hidden_units//2, 1) 
    )

  def forward(self, state):
    if len(state.size()) == 3:
      state = torch.flatten(state, start_dim=0)
    else:
      state = state

    value = self.critic(state) 
    return value
  
'''
Actor Network: represents the agent policy function

Input: fixed-length encoded observation so it's shareable among heterogenous agents
Output: Action Logit
'''
class ActorNetwork(nn.Module):
  def __init__(self, input_dim, output_dim, num_actions, hidden_units=100):
    super(ActorNetwork, self).__init__()

    self.encoder = Encoder(input_dim, output_dim)
    self.actor = nn.Sequential(
        nn.Linear(output_dim, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, num_actions)
    )
    self.train()

  def forward(self, obs):
    x = self.encoder(obs)
    logits = self.actor(x)
    return logits

In [10]:
class ExplorerAgent(object):
  def __init__(self, loc, sensor_package=3):
    self.x = loc[0]
    self.y = loc[1]
    self.sensor_package = sensor_package

  def declare_networks(self):
    self.model = ActorNetwork((self.sensor_package*2)**2, ENCODER_OUTPUT_DIM, NUM_ACTIONS)
    self.optimizer = optim.RMSprop(self.model.parameters(), lr=LR, alpha=0.99, eps=1e-5)
    self.model = self.model.to(DEVICE)

  def move(self, action):
    move = move_dict[action]
    self.x += move[0]
    self.y += move[1]

  def set_location(self, loc):
    self.x = loc[0]
    self.y = loc[1]

  def __str__(self):
    return "Agent at x=" + str(self.x) + " and y=" + str(self.y) + " with sensor package=" + str(self.sensor_package)

Setting up Environment & Reward




In [11]:
class UnknownTerrain(object):
  '''
  0 = passable
  1 = impassible or out of bounds
  2 = other agent
  '''
  def __init__(self, size=16, density=0.1, num_agents=3, padding=3):
    self.size = size
    self.padding = padding #if agent is at the edge of grid the environment will pad out impassable terrian so 
    # network has something to read
    self.density = density #how much impassable terrian there is 
    self.observation_space = (padding * 2, padding * 2)
    self.state_space = (size, size)
    self.action_space = 8 # 8 cardinal directions
    self.grid = np.zeros((size, size)).astype(np.uint8)
    block_x = np.random.randint(0, size, size=int((size**2) * density))
    block_y = np.random.randint(0, size, size=int((size**2) * density))
    block_p = np.array(list(zip(block_x, block_y))) #generate random locations for obstacles
    for x, y in block_p:
      self.grid[x, y] = 1
    self.num_agents = num_agents
    self.locations = np.dstack(np.where(self.grid != 1)).squeeze() #generates viable agent placement locations
    self.agents = [ExplorerAgent(self.locations[np.random.choice(self.locations.shape[0])]) for i in range(0, num_agents)]

    self.grid = np.pad(self.grid, pad_width=padding, mode="constant", constant_values=1) # pad out grid
    #padding out grid
    for agent in self.agents:
      agent.x += padding # shift agents over with padding
      agent.y += padding
      self.grid[agent.x, agent.y] = 2 # assign grid location to agent


    self.uncovered_grid = np.zeros((size, size)).astype(np.uint8)
    self.rewards = {agent:0 for agent in self.agents} # Dictionary containing agents and their last step information
    self.step_start_uncovered_ratio = 0.0 
    self.last_move_uncovered_ratio = 0.0
    self.step = 0
    self.max_steps = NUM_STEPS


  '''
  Reset all changing information
  '''
  def reset(self):
    self.grid = np.zeros((self.size, self.size)).astype(np.uint8)
    block_x = np.random.randint(0, self.size, size=int((self.size**2) * self.density))
    block_y = np.random.randint(0, self.size, size=int((self.size**2) * self.density))
    block_p = np.array(list(zip(block_x, block_y)))
    for x, y in block_p:
      self.grid[x, y] = 1
    self.locations = np.dstack(np.where(self.grid != 1)).squeeze()
    self.grid = np.pad(self.grid, pad_width=self.padding, mode="constant", constant_values=1) # need to adjust agent positions
    for agent in self.agents:
      agent.set_location(self.locations[np.random.choice(self.locations.shape[0])]) # Randomize location again
      agent.x += self.padding 
      agent.y += self.padding
      self.grid[agent.x, agent.y] = 2

    self.uncovered_grid = np.zeros((self.size, self.size)).astype(np.uint8)
    self.rewards = {agent:0 for agent in self.agents}
    self.step_start_uncovered_ratio = 0.0 
    self.last_move_uncovered_ratio = 0.0
    self.step = 0

    return self.observe_()


  '''
  Action format is values 0-7
  
  0 = (1, 0)
  1 = (1, 1)
  2 = (0, 1)
  3 = (-1,1)
  4 = (-1,0)
  5 = (-1,-1)
  6 = (0,-1)
  7 = (1,-1)
  '''
  def is_valid_(self, agent, action): # need to make sure that the move works, otherwise agent doesn't move and receives penalty
    assert 0 <= action <= 7
    move = move_dict[action]

    return self.grid[agent.x + move[0], agent.y + move[1]] == 0 # anything that isn't passable isn't considered invalid

  '''
  Ends if we have completed entire grid or we exceed the max number of steps
  '''
  def env_done_(self):
    return np.all(self.grid == 1) or self.step >= self.max_steps

  def calc_uncovered_ratio_(self):
    return np.sum(self.uncovered_grid) / (self.size * self.size)

  '''
  joint_action = list of actions to gake for each agent

  action cannot move into the space of inpassable terrain / action cannot occupy the space of another agent / cannot go out of bounds
  out of bounds is encoded as impassible terrain
  '''
  def step_(self, joint_actions):
    done = False
    info = { agent:
              {
                "complete":0, "group_uncover":0, "individual_uncover":0, "visitation_penalty":0, "collision_penalty":0
              } for agent in self.agents
           }

    for agent, action in zip(self.agents, joint_actions):
      action = action.item()
      if not self.is_valid_(agent, action): # Don't move if the action isn't valid
        self.rewards[agent] += COLLISION_PENALTY # Reward 5
        info[agent]["collision_penalty"] = COLLISION_PENALTY
      else: # If the movement is valid that take it
        self.grid[agent.x, agent.y] = 0 # We assume the cell it occupied was passable so just set it back
        agent.move(action)
        self.grid[agent.x, agent.y] = 2 # Update the new cell as containing the agent

      self.set_observed_cells_(agent) # fill in the cells that the agent has observed
      if self.env_done_(): # If the entire grid has been observed then terminate
        done = True
        self.rewards[agent] += COMPLETION_REWARD # Reward 1
        info[agent]["complete"] = COMPLETION_REWARD

      current_uncovered_ratio = self.calc_uncovered_ratio_()
      individual_fraction_uncovered_reward = current_uncovered_ratio - self.last_move_uncovered_ratio
      self.last_move_uncovered_ratio = current_uncovered_ratio

      if individual_fraction_uncovered_reward <= 1e-5:
        self.rewards[agent] += VISITATION_PENALTY # Reward 4
        info[agent]["visitation_penalty"] = VISITATION_PENALTY
      else:
        self.rewards[agent] += individual_fraction_uncovered_reward # Reward 3
        info[agent]["individual_uncover"] = individual_fraction_uncovered_reward

    final_uncovered_ratio = self.calc_uncovered_ratio_() # compute final uncovered reward ratio for agent
    timestep_fraction_uncovered_reward = final_uncovered_ratio - self.step_start_uncovered_ratio
    self.step_start_uncovered_ratio = final_uncovered_ratio

    for agent in self.agents: # update each agents reward
      self.rewards[agent] += timestep_fraction_uncovered_reward # Reward 2
      info[agent]["group_uncover"] = timestep_fraction_uncovered_reward

    self.step += 1 # go to the next step
    obs = self.observe_()
    reward = self.rewards_(info)
    done = np.array([done] * NUM_AGENTS)
    return obs, reward, done

  '''
  Reward is computed on 5 elements

  1 = team-based terminal reward given after completing the grid
  2 = team-based progress reward based on the fraction of uncovered cells during timestep
  3 = individual discovery reward for cells uncovered
  4 = individual visitation penalty if agent didn't uncover any cells
  5 = individual collision penalty if agent collided with terrain or went out of bounds
  '''
  def get_agent_reward_(self, agent):
    return self.rewards[agent]

  '''
  Return, as tuple, all the observations for the agents as defined by their sensor_package
  '''
  def observe_(self):
    obs = []
    for agent in self.agents:
      obs.append(self.grid[agent.x-agent.sensor_package:agent.x+agent.sensor_package, agent.y-agent.sensor_package:agent.y+agent.sensor_package])
    return np.array(obs)

  def rewards_(self, reward_dict):
    #adding up rewards in the current step
    rewards = []
    for agent, reward in reward_dict.items():
      rewards.append(np.sum(list(reward.values())))
    return np.array(rewards).astype(np.float32)


  def set_observed_cells_(self, agent):
    x,y = agent.x, agent.y
    # actual grid is padded but observed cells are not so we unpad
    x -= self.padding # readjust to account for padding offset
    y -= self.padding
    x_min, x_max = x-agent.sensor_package, x+agent.sensor_package
    y_min, y_max = y-agent.sensor_package, y+agent.sensor_package

    x_min, x_max = np.maximum(0, x_min), np.minimum(self.size, x_max) # make sure that we aren't going passed the boundaries
    y_min, y_max = np.maximum(0, y_min), np.minimum(self.size, y_max)
    self.uncovered_grid[x_min: x_max, y_min: y_max] = 1

  def get_state_(self):
    #extracting grid without padding
    return self.grid[0+self.padding:self.size+self.padding, 0+self.padding:self.size+self.padding]

  def render_(self):
    for i in range(self.padding, self.size + self.padding):
      for j in range(self.padding, self.size + self.padding):
        current = self.grid[i, j]
        if current == 0:
          print(" ", end="")#passable terrin
        elif current == 1:
          print("■", end="")#obstacles
        else:
          print("X", end="")#agent
      print()

  

Setting up Agent

In [12]:
class Model(object):
  def __init__(self, static_policy=False, env=None):
    self.device = DEVICE

    self.gamma = GAMMA
    self.lr = LR
    self.value_loss_weight = VALUE_LOSS_WEIGHT
    self.entropy_loss_weight = ENTROPY_LOSS_WEIGHT
    self.rollout = ROLLOUT
    self.grad_norm_max = GRAD_NORM_MAX

    self.static_policy = static_policy
   
    self.env = env
    self.agents = env.agents
    self.num_feats = env.observation_space #just number of features
    self.num_actions = env.action_space
    self.state_space = env.state_space
    
    self.critic_network = CriticNetwork(self.state_space)
    self.optimizer = optim.RMSprop(self.critic_network.parameters(), lr=LR, alpha=0.99, eps=1e-5)
    self.declare_networks() #declare networks for num_agents
    self.rollouts = self.rollouts = RolloutStorage(self.rollout, NUM_AGENTS,
            self.num_feats, self.state_space, self.env.action_space, self.device)

    self.value_losses = {}
    self.entropy_losses = {}
    self.policy_losses = {}
    
  def declare_networks(self):
    for agent in self.agents:
      agent.declare_networks()

  def get_action(self, obs, state, deterministic=True): # FINISHED?
    values, actions, action_log_probs = [], [], []
    for agent, current_obs in zip(self.agents, obs):
      logits = agent.model(current_obs)
      value  = self.critic_network(state)
      dist = torch.distributions.Categorical(logits=logits)

      if deterministic: #if u want agent to always do the same thing then just take the highest value from distribution
          action = dist.probs.argmax(keepdim=True) # removed dim=1
      else:
          action = dist.sample().view(-1, 1) #other sample from distribution

      #action_log_prob = F.log_softmax(logits, dim=0) # removed dim=1, used to be log_probs
      #action_log_prob = log_probs.gather(1, actions)

      log_prob = F.log_softmax(logits, dim=0) #percent of chance of this action being good under current policy
      action_log_prob = log_prob.gather(0, action) #reformat logprob so its in the shape of action, if there are 8 actions there should be 8 probabilities 

      values.append(value)
      actions.append(action)
      action_log_probs.append(action_log_prob)
    
    return torch.stack(values), torch.stack(actions), torch.stack(action_log_probs)

  def get_actions_only(self):
    actions = []
    for agent, o in (self.agents, env.observe_()):
      action = agent.model(o)
      actions.append(action)
    return actions # Decouples the critic from the value function

  def evaluate_actions(self, obs, state, actions):
    values, action_log_probs, dist_entropys = [], [], []
    for agent, current_obs, action in zip(self.agents, obs, actions):
      logits = agent.model(current_obs)
      value = self.critic_network(state)

      dist = torch.distributions.Categorical(logits=logits)

      log_prob = F.log_softmax(logits, dim=1)
      action_log_prob = log_prob.gather(1, action.view(-1,1))

      dist_entropy = dist.entropy().mean() #TODO

      values.append(value)
      action_log_probs.append(action_log_prob)
      dist_entropys.append(dist_entropy)
    return values, action_log_probs, dist_entropys 

  def get_values(self, obs, state): 
    values = []
    for agent, current_obs in zip(self.agents, obs):
      value = self.critic_network(state)
      values.append(value)

    return torch.stack(values)

  def compute_loss(self):
    obs_shape = self.rollouts.observations.size()[2:]
    state_shape = self.rollouts.state.size()[1:]
    action_shape = self.rollouts.actions.size()[-1]
    num_steps, num_processes, _ = self.rollouts.rewards.size() 

    losses, action_losses, value_losses, dist_entropys = [],[],[],[]

    values, action_log_probs, dist_entropys = self.evaluate_actions(
        self.rollouts.observations[:-1].view(NUM_AGENTS, -1, *obs_shape),
        self.rollouts.state[:-1], 
        self.rollouts.actions.view(NUM_AGENTS, -1))
    
    for agent, value, action_log_prob, dist_entropy, returns in zip(self.agents, values, action_log_probs, dist_entropys, self.rollouts.returns[:-1].view(NUM_AGENTS, -1)): # reshape to num_agents
      advantages = returns - value.view(num_steps, 1) 
      value_loss = advantages.pow(2).mean() 

      action_loss = -(advantages.detach() * action_log_prob.view(num_steps, 1)).mean()
      loss = action_loss + self.value_loss_weight * value_loss - self.entropy_loss_weight * dist_entropy

      losses.append(loss), action_losses.append(action_loss), value_losses.append(value_loss), dist_entropys.append(dist_entropy)
    state = self.rollouts.state[:-1]
    critic_loss = (self.rollouts.returns[:-1].mean(dim=1) - self.critic_network(state)).mean()

    return losses, action_losses, value_losses, dist_entropys, critic_loss

  def update(self): 
    losses, action_losses, value_losses, dist_entropys, critic_loss = self.compute_loss()

    for agent, loss, action_loss, value_loss, dist_entropy in zip(self.agents, losses, action_losses, value_losses, dist_entropys):
      agent.optimizer.zero_grad()
      loss.backward(retain_graph=True) 
      torch.nn.utils.clip_grad_norm_(agent.model.parameters(), self.grad_norm_max)
      agent.optimizer.step()
      self.save_loss(loss.item(), action_loss.item(), value_loss.item(), dist_entropy.item(), agent) #save updated loss


    critic_loss.backward(retain_graph=True)
    torch.nn.utils.clip_grad_norm_(self.critic_network.parameters(), self.grad_norm_max)
    self.optimizer.step()

    return value_losses, action_losses, dist_entropys, critic_loss

  def save_loss(self, loss, policy_loss, value_loss, entropy_loss, agent):
    if agent not in self.policy_losses:
      self.policy_losses[agent] = [policy_loss]
    else:
      self.policy_losses[agent].append(policy_loss)

    if agent not in self.value_losses:
      self.value_losses[agent] = [value_loss]
    else:
      self.value_losses[agent] = [value_loss]

    if agent not in self.entropy_losses:
      self.entropy_losses[agent] = [entropy_loss]
    else:
      self.entropy_losses[agent].append(entropy_loss)

Training Loop

In [13]:
def pretty_print(episode_idx, total_step_num, dist_entropy, value_loss, action_loss, critic_loss, steps):
  print(f"{episode_idx} | {episode_idx} / {ENV_STEPS+1}")
  print("-------------------------------------------------")
  idx = 0
  for e, v, a in zip(dist_entropy, value_loss, action_loss):
    print(f"Agent {idx}: Dist Entropy: {e} | Value Loss: {v} | Action Loss: {a}")
    idx += 1
  print("Steps Required to Solve:", steps)
  print("Critic Loss:", critic_loss)

In [14]:
if __name__=='__main__':

    env = UnknownTerrain()

    obs_shape = env.observation_space
    state_shape = env.state_space
    model = Model(static_policy=False, env=env)

    current_obs = torch.zeros(NUM_AGENTS, *obs_shape,
                    device=DEVICE, dtype=torch.float)
    current_state = torch.zeros(*state_shape,
                    device=DEVICE, dtype=torch.float)
    #create arbitary obs/state to start the training process, load data into those tensors once we have data
    def update_current_obs(obs):
      return torch.from_numpy(obs.astype(np.float32)).to(DEVICE)

    def update_current_state(state):
      return torch.from_numpy(state.astype(np.float32)).to(DEVICE)

    obs = env.reset() 
    current_obs = update_current_obs(obs)
    state = env.get_state_()
    current_state = update_current_state(state)

    model.rollouts.observations[0].copy_(current_obs)
    model.rollouts.state[0].copy_(current_state)#add into rollout storage
    
    episode_rewards = np.zeros(NUM_AGENTS, dtype=np.float32)
    final_rewards = np.zeros(NUM_AGENTS, dtype=np.float32)

    start=timer()

    print_step = 1
    print_threshold = 10
    best_steps_required = 100
    current_steps_required = 0
    
    for episode_idx in range(1, ENV_STEPS+1):
        for step in range(ROLLOUT):
            with torch.no_grad():
                values, actions, action_log_prob = model.get_action(model.rollouts.observations[step], model.rollouts.state[step]) 
            cpu_actions = actions.view(-1).cpu().numpy()#take it fr gpu put it into cpu convert to numpy, desont do anything if already on cpu

            obs, reward, done = env.step_(cpu_actions)# step env according to actions
            episode_rewards = np.add(reward, episode_rewards)#put rewards together
            masks = 1. - done.astype(np.float32)#take out rewards doesnt matter, mask is boolean
            final_rewards *= masks
            final_rewards = np.add(reward, (1. - masks) * episode_rewards)
            episode_rewards *= masks


            rewards = torch.from_numpy(reward.astype(np.float32)).view(-1, 1).to(DEVICE)
            masks = torch.from_numpy(masks).to(DEVICE).view(-1, 1)
            current_obs *= masks.view(-1, 1, 1)#remove observations fr terminal states
            current_obs = update_current_obs(obs)
            current_state = torch.from_numpy(env.get_state_().astype(np.float32)).to(DEVICE)

            if(np.all(done)):
              current_steps_required = env.step
              env.reset()
              #if in terminal state, reset and statrt again

              if current_steps_required < best_steps_required: #keep track of how fast agents solve it 
                best_steps_required = current_steps_required


            model.rollouts.insert(current_obs, actions.view(-1, 1), action_log_prob, values, rewards, masks, current_state)
            
        with torch.no_grad():
            next_value = model.get_values(model.rollouts.observations[-1], model.rollouts.state[-1]) #compute next value

        model.rollouts.compute_returns(next_value, GAMMA)#compute returns from next value
        value_loss, action_loss, dist_entropy, critic_loss = model.update()

        model.rollouts.after_update()

        if episode_idx % 100 == 0:
            try:
                # clear_output()
                end = timer()
                total_num_steps = (episode_idx + 1) * NUM_AGENTS * ROLLOUT
                '''
                print("Updates {}, Num Timesteps {}, FPS {},\nMean/Median Reward {:.1f}/{:.1f}, Min/Max Reward {:.1f}/{:.1f},\nEntropy {:.5f}, Value Loss {:.5f}, Policy Loss {:.5f}".
                format(episode_idx, total_num_steps,
                       int(total_num_steps / (end - start)),
                       np.mean(final_rewards),
                       np.median(final_rewards),
                       np.min(final_rewards),
                       np.max(final_rewards), dist_entropy,
                       value_loss, action_loss))
                '''
                pretty_print(episode_idx, total_num_steps, dist_entropy, value_loss, action_loss, critic_loss, best_steps_required)
            except IOError:
                pass

100 | 100 / 10001
-------------------------------------------------
Agent 0: Dist Entropy: 7.8451672949020335e-28 | Value Loss: 841959.875 | Action Loss: -284654.1875
Agent 1: Dist Entropy: 1.0884332656860352 | Value Loss: 324731.125 | Action Loss: -17992.58984375
Agent 2: Dist Entropy: 1.9007153511047363 | Value Loss: 30713.1640625 | Action Loss: -117.13890075683594
Steps Required to Solve: 100
Critic Loss: tensor(-506.3564, grad_fn=<MeanBackward0>)
200 | 200 / 10001
-------------------------------------------------
Agent 0: Dist Entropy: 0.0 | Value Loss: 281978.21875 | Action Loss: -779920.25
Agent 1: Dist Entropy: 2.0455002969210767e-10 | Value Loss: 39542.34765625 | Action Loss: -9994.6533203125
Agent 2: Dist Entropy: 1.3302711248397827 | Value Loss: 149684.703125 | Action Loss: 306.63232421875
Steps Required to Solve: 100
Critic Loss: tensor(-108.7857, grad_fn=<MeanBackward0>)
300 | 300 / 10001
-------------------------------------------------
Agent 0: Dist Entropy: 0.0 | Value L

KeyboardInterrupt: ignored

Training isn't occuring properly because the embedding layer isn't being updated according to triplet loss.