## PPO design notes

### Actor/critic networks design
### Env
-convert obs to proper obs

-reward calculation

In [1]:
import json
from IPython.display import display, Javascript
from luxai_s3.wrappers import LuxAIS3GymEnv, RecordEpisode
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter
import os
from my_agent.lux.utils import direction_to, direction_to_change
import matplotlib.pyplot as plt
import numpy as np
import random
from maps import EnergyMap, RelicMap, TileMap
from astar import *
import gymnasium as gym
from gymnasium.spaces import MultiDiscrete, Discrete, Tuple
from agent import Agent

In [2]:
a = torch.rand((16,3))
b = Categorical(logits = a)
c = b.sample()
d = b.log_prob(c)
print(a)
print(c)
print(d)

tensor([[0.7593, 0.4824, 0.9285],
        [0.1835, 0.5721, 0.5691],
        [0.5621, 0.8385, 0.9404],
        [0.4688, 0.9578, 0.5742],
        [0.5567, 0.1621, 0.4108],
        [0.1294, 0.0489, 0.4268],
        [0.6163, 0.6996, 0.3939],
        [0.7972, 0.6215, 0.9689],
        [0.9725, 0.7152, 0.0961],
        [0.0560, 0.6166, 0.4640],
        [0.3461, 0.8050, 0.1723],
        [0.5848, 0.8527, 0.0359],
        [0.3035, 0.7428, 0.8109],
        [0.4202, 0.3935, 0.4194],
        [0.8196, 0.2978, 0.9260],
        [0.5282, 0.8194, 0.3422]])
tensor([2, 0, 2, 2, 1, 0, 1, 2, 0, 1, 1, 1, 1, 1, 0, 2])
tensor([-0.9101, -1.3726, -0.9509, -1.2142, -1.3260, -1.1845, -0.9771, -0.9356,
        -0.7837, -0.8876, -0.7715, -0.7916, -0.9987, -1.1162, -0.9954, -1.3392])


In [3]:
 class ProxyAgent():
    def __init__(self, player: str, env_cfg) -> None:
        self.player = player
        self.opp_player = "player_1" if self.player == "player_0" else "player_0"
        self.team_id = 0 if self.player == "player_0" else 1
        self.opp_team_id = 1 if self.team_id == 0 else 0
        np.random.seed(0)
        self.env_cfg = env_cfg
        if self.player=="player_0":
            self.start_pos = [0,0]
            self.pnum = 1
        else:
            self.start_pos = [23,23]
            self.pnum = 0
        self.unit_explore_locations = dict()
        self.relic_node_positions = []
        self.discovered_relic_nodes_ids = set()
        self.n_units = self.env_cfg["max_units"]
        self.match_num = 1
        self.relic_map = RelicMap(self.n_units)
        self.tile_map = TileMap()
        self.energy_map = EnergyMap()
        self.move_cost = 3.0
        self.nebula_drain = 5.0
        self.move_check = 0
        self.nebula_check = 0
        
        self.range = self.env_cfg["unit_sensor_range"]
        self.sap_range = self.env_cfg["unit_sap_range"]
        self.sap_cost = self.env_cfg["unit_sap_cost"]
        self.width = self.env_cfg["map_width"]
        self.height = self.env_cfg["map_height"]
        
        self.unit_has_target = -np.ones((self.n_units))
        self.unit_targets = dict(zip(range(0,self.n_units), np.zeros((self.n_units,2))))
        self.unit_targets_previous = dict(zip(range(0,self.n_units), np.zeros((self.n_units,2))))
        self.unit_path = dict(zip(range(0,self.n_units), [[] for i in range(0,self.n_units)]))
        self.unit_moved = np.zeros((self.n_units))
        self.prev_points = 0
        self.prev_point_diff = 0
        self.prev_points_increase = 0
        self.prev_actions = None
        self.previous_energys = 100*np.zeros((self.n_units))
        self.previous_positions = -np.ones((self.n_units,2))


    def get_explore(self, current):
        a = np.stack((np.repeat(np.arange(24),24,axis=0).reshape((24,24)), np.repeat(np.arange(24),24,axis=0).reshape((24,24)).T),axis=2)
        a[current!=-1] = [100,100]
        self.explore_choices = a[np.sum(np.abs(a-np.array(self.start_pos)),axis=-1)<24-self.range].tolist()
        if self.explore_choices:
            return random.choice(self.explore_choices)
        else:
            x = np.random.randint(0,24)
            y = np.random.randint(0,24-x)
            return [abs(x-self.start_pos[0]), abs(y-self.start_pos[1])]

    def get_moves(self, obs, unit_id, unit_pos):
        prev_pos = [unit_pos[0] - direction_to_change(self.prev_actions[unit_id][0])[0], unit_pos[1] - direction_to_change(self.prev_actions[unit_id][0])[1]]
        new_pos = [[unit_pos[0], unit_pos[1]-1],
                  [unit_pos[0]+1, unit_pos[1]],
                  [unit_pos[0], unit_pos[1]+1],
                  [unit_pos[0]-1, unit_pos[1]]]
        moves = [0]
        for ii, pos in enumerate(new_pos):
            if pos[0]<0 or pos[1]<0 or pos[0]>=self.width or pos[1]>=self.height or (pos[0]==prev_pos[0] and pos[1]==prev_pos[1]) or obs["map_features"]["tile_type"][pos[0], pos[1]]==2 :
            #if pos[0]<0 or pos[1]<0 or pos[0]>23 or pos[1]>23 or obs["map_features"]["tile_type"][pos[0], pos[1]]==2:
                pass
            else:
                moves.append(direction_to(unit_pos, pos))
        #print(moves)
        return moves
        
    def reset(self):
        self.match_num += 1
        self.unit_has_target = -np.ones((self.n_units))
        self.unit_targets = dict(zip(range(0,self.n_units), np.zeros((self.n_units,2))))
        self.unit_targets_previous = dict(zip(range(0,self.n_units), np.zeros((self.n_units,2))))
        self.unit_path = dict(zip(range(0,self.n_units), [[] for i in range(0,self.n_units)]))
        self.unit_moved = np.zeros((self.n_units))
        self.prev_points = 0
        self.prev_point_diff = 0
        self.prev_points_increase = 0
        self.prev_actions = np.zeros((self.env_cfg["max_units"], 3), dtype=int)
        self.prev_energys = 100*np.ones((self.n_units))
        self.previous_positions = -np.ones((self.n_units,2))

    def compare_positions(self, pos1, pos2):
        return pos1[0]==pos2[0] and pos1[1]==pos2[1]
        
    # bunnyhop mechanic (maximize points by avoiding doubling on fragment)
    def bunnyhop(self, unit, unit_positions):
        counter = 0
        unit_pos = unit_positions[unit]
        for unit2 in range(self.n_units):            
            if self.unit_has_target[unit2]==2 and self.tile_map.map[unit_positions[unit2][0],unit_positions[unit2][1]]!=2 and len(self.unit_path[unit])>1 and self.compare_positions(self.unit_path[unit][0],unit_positions[unit2]):
                self.unit_path[unit2] = self.unit_path[unit][1:]
                self.unit_targets[unit2] = self.unit_targets[unit]
                self.unit_has_target[unit2] = 1#self.unit_has_target[unit]
                self.unit_path[unit] = [unit_positions[unit2]]
                self.unit_targets[unit] = unit_positions[unit2]
                self.unit_has_target[unit] = 1
                counter +=1
                if counter<10:
                    self.bunnyhop(unit2, unit_positions)

    def positions_to_map(self, unit_positions):
        unit_map = np.zeros((24,24))
        for unit in unit_positions:
            if unit[0]!=-1 and unit[1]!=-1:
                unit_map[unit[0],unit[1]] = 1
        return unit_map

    # adjust for not only direct hits, but adjacent hits
    def check_hit(self, target):
        for pos in self.enemy_positions:
            if pos[0]!=-1 and pos[1]!=-1:
                if pos[0]==target[0] and pos[1]==target[1]:
                    return 1
        else:
            return 0

    def get_init_proxy_obs(self, obs):
         return (np.array([np.zeros((24,24),dtype=int) for i in range(6)]),np.array([0,0,0]))
     
    def step(self, step, obs):
        reward = 0
        unit_mask = np.array(obs["units_mask"][self.team_id]) # shape (max_units, )
        self.unit_positions = np.array(obs["units"]["position"][self.team_id]) # shape (max_units, 2)
        self.enemy_positions = np.array(obs["units"]["position"][abs(self.team_id-1)]).tolist()
        self.unit_energys = np.array(obs["units"]["energy"][self.team_id]) # shape (max_units, 1)
        observed_relic_node_positions = np.array(obs["relic_nodes"]) # shape (max_relic_nodes, 2)
        observed_relic_nodes_mask = np.array(obs["relic_nodes_mask"]) # shape (max_relic_nodes, )
        team_points = np.array(obs["team_points"]) # points of each team, team_points[self.team_id] is the points of the your team
        increase = team_points[self.team_id]-self.prev_points
        diff = team_points[self.team_id] - team_points[abs(self.team_id-1)]
        diff_change = diff-self.prev_point_diff
        self.prev_point_diff = diff
        # ids of units you can control at this timestep
        current_tile_map = obs["map_features"]["tile_type"]
        current_energy_map = obs["map_features"]["energy"]
        ### proxy reward calculation ###
        # change in point difference 
        reward += diff
        # units on known fragment tiles
        for unit in range(self.n_units):
            pos = self.unit_positions
            if pos[0]!=-1 and pos[1]!=-1:
                if self.relic_map.map_knowns[pos[0],pos[1]]==1:
                    reward += 1
                # units targeting possibles/known fragments
                t = self.unit_targets[unit]
                if self.relic_map.map_knowns[t[0],t[1]]==1 or self.relic_map.map_possibles[t[0],t[1]]==1:
                    reward += 1
            # unit dies (negative reward)
            else: 
                if self.unit_moved[unit]:
                    reward += -1
            # hit enemy
            action = self.prev_actions[unit]
            if action[0]==5:
                reward += self.check_hit(action[1:])
            
            
        
        if step in [102,203,304,405]:
            self.reset()
            
        # visible relic nodes
        visible_relic_node_ids = set(np.where(observed_relic_nodes_mask)[0])
        # save any new relic nodes that we discover for the rest of the game.
        for ii in visible_relic_node_ids:
            if ii not in self.discovered_relic_nodes_ids:
                # explore units switch to relic collection
                self.relic_map.new_relic(observed_relic_node_positions[ii])
                self.discovered_relic_nodes_ids.add(ii)
                self.discovered_relic_nodes_ids.add((ii+3)%6)
                self.relic_node_positions.append(observed_relic_node_positions[ii])
        # update maps
        self.available_unit_ids = np.where(unit_mask)[0].tolist()
        self.relic_map.step(unit_positions, increase)
        tile_shift = self.tile_map.update(current_tile_map)
        energy_shift = self.energy_map.update(current_energy_map)        

        # find out move cost
        if step>2 and not self.move_check and self.tile_map.map[unit_positions[0][0],unit_positions[0][1]]!=1 and self.unit_moved[0]:
            self.move_cost=self.previous_energys[0]-unit_energys[0]+self.energy_map.map[unit_positions[0][0],unit_positions[0][1]]
            self.move_check=1
        # find out nebula drain
        if not self.nebula_check and self.move_check:
            for unit in available_unit_ids:
                if self.unit_moved[unit] and  self.tile_map.map[unit_positions[unit][0],unit_positions[unit][1]]==1:
                    self.nebula_check=1
                    self.nebula_drain = -(unit_energys[unit]-self.previous_energys[unit]-self.energy_map.map[unit_positions[unit][0],unit_positions[unit][1]]+self.move_cost)
                    break
        tiles = np.ones(24,24)
        tiles[self.tile_map.map==2] = 0
        energy = self.energy_map.map.copy()
        energy[self.tile_map.map==1] = energy[self.tile_map.map==1] - self.nebula_drain
        my_unit_map = self.positions_to_map(self.unit_positions)
        enemy_unit_map = self.positions_to_map(self.enemy_positions)
        proxy_obs = (np.array(tiles, energy, self.relic_map.map_possibles, self.relic_map.map_knowns, my_unit_map, enemy_unit_map), np.array([step, diff, np.sum(self.unit_energys)]))
        return proxy_obs, reward
        
    def act(self, proxy_action):
        actions = np.zeros((self.n_units, 3), dtype=int)
        for unit in self.available_unit_ids:
            if proxy_action[unit][0]!=5:
                self.unit_targets[unit] = [proxy_action[unit][1]],proxy_action[unit][2]
                '''if not self.compare_positions(self.unit_targets[unit], self.unit_targets_previous[unit]):
                    path, _ = a_star(unit_positions[unit], self.unit_targets[unit], self.tile_map.map, self.energy_map.map, self.relic_map.map_knowns, self.move_cost, self.nebula_drain, use_energy=False)
                    self.unit_path[unit] = path[1:]'''
                direction = direction_to(self.unit_positions[unit], self.unit_targets[unit])
                change = direction_to_change(direction)
                self.unit_path[unit] = [self.unit_positions[unit][0]+change[0],self.unit_positions[unit][1]+change[1]]
                    
        discover_flag = 0
        # Decide on action. Follow path, if multiple units want to move to possible fragment only let one through, if attacking fire on enemy instead of moving
        for unit in self.available_unit_ids:
            unit_pos = self.unit_positions[unit]
            self.bunnyhop(unit, self.unit_positions)
            
        for unit in self.available_unit_ids:
            unit_pos = self.unit_positions[unit]
            if proxy_action[unit][0]==5:
                actions[unit] = [5,unit_pos[0]-proxy_action[unit][1],unit_pos[1]-proxy_action[unit][2]]
            else:
                if unit_energys[unit]<self.move_cost:
                    actions[unit]=[0,0,0]
                elif self.unit_path[unit]:
                    if self.relic_map.map_possibles[self.unit_path[unit][0][0],self.unit_path[unit][0][1]]==1:
                        if discover_flag:
                            if self.relic_map.map_possibles[unit_pos[0],unit_pos[1]]==1:
                                actions[unit] = self.relic_map.move_away(self.tile_map.map, [unit_pos[0],unit_pos[1]])
                                self.unit_path[unit].insert(0, unit_pos)
                            else:
                                actions[unit]=[0,0,0]
                        else:
                            actions[unit] = [direction_to(unit_pos, self.unit_path[unit].pop(0)), 0, 0]
                            discover_flag=1
                    else:
                        actions[unit] = [direction_to(unit_pos, self.unit_path[unit].pop(0)), 0, 0]
                else:
                    if self.relic_map.map_possibles[unit_pos[0],unit_pos[1]]==1:
                        if discover_flag:
                            actions[unit] = self.relic_map.move_away(self.tile_map.map, [unit_pos[0],unit_pos[1]])
                            self.unit_path[unit].insert(0, unit_pos)
                        else:
                            actions[unit]=[0,0,0]
                            discover_flag = 1
                    else:
                        actions[unit]=[0,0,0]
        self.unit_targets_previous = self.unit_targets
        self.previous_energys = unit_energys
        self.relic_map.map_occupied = np.zeros((24,24))
        self.prev_points = team_points[self.team_id]
        self.prev_points_increase = increase
        self.prev_actions = actions
        self.previous_positions = unit_positions
        return actions

In [4]:
map_space = Tuple((
            MultiDiscrete(np.full((24,24),24)),
            MultiDiscrete(np.full((24,24),24)),
            MultiDiscrete(np.full((24,24),24)),
            MultiDiscrete(np.full((24,24),24)),
            MultiDiscrete(np.full((24,24),24)),
            MultiDiscrete(np.full((24,24),24)),
        ))
param_space = MultiDiscrete(np.array([505, 1000, 16*400]))
observation_space = Tuple((map_space, param_space))
print((observation_space[1]).shape)

(3,)


In [42]:
def env_fn():
    return ProxyEnvironment()

class ProxyEnvironment(gym.Env):
    def __init__(self):
        self.n_maps = 6
        self.n_state_params = 3
        self.map_space = Tuple((
            MultiDiscrete(np.full((24,24),24)),
            MultiDiscrete(np.full((24,24),24)),
            MultiDiscrete(np.full((24,24),24)),
            MultiDiscrete(np.full((24,24),24)),
            MultiDiscrete(np.full((24,24),24)),
            MultiDiscrete(np.full((24,24),24)),
        ))
        self.param_space = MultiDiscrete(np.array([505, 1000, 16*400]))
        self.observation_space = Tuple((self.map_space, self.param_space))
        self.action_space = MultiDiscrete(np.array([5 for i in range(16)]))
        self.env = RecordEpisode(LuxAIS3GymEnv(numpy_output=True), save_on_close=False, save_on_reset=False, save_dir="replays")
        self.obs, info  = self.env.reset()
        self.agent1 = ProxyAgent("player_0", info["params"])
        self.agent2 = Agent("player_1", info["params"])
        self.current_step = 0

    def close(self):
        self.env.close()
        
    def reset(self, seed, options):
        self.current_step = 0
        self.obs, info = self.env.reset(seed=seed)
        self.agent1 = ProxyAgent("player_0", info["params"])
        self.agent2 = Agent("player_1", info["params"])
        self.proxy_obs = self.agent1.get_init_proxy_obs(self.obs)
        return self.proxy_obs, info

    def step(self, proxy_action):
        self.current_step += 1
        actions = dict()
        actions["player_0"] = self.agent1.act(proxy_action)
        actions["player_1"] = self.agent2.act(step=self.current_step, obs=self.obs[agent.player])
        self.obs, reward, terminated, truncated, info = env.act(actions)
        self.proxy_obs, self.proxy_reward = env.step(self.obs, self.current_step)
        dones = {k: terminated[k] | truncated[k] for k in terminated}
        if dones["player_0"] or dones["player_1"]:
            game_done = True
        return self.proxy_obs, self.proxy_reward, terminated, truncated, info

In [43]:
class Args:
    exp_name: str =""
    """the name of this experiment"""
    seed: int = 1
    """seed of the experiment"""
    torch_deterministic: bool = True
    """if toggled, `torch.backends.cudnn.deterministic=False`"""
    cuda: bool = False
    """if toggled, cuda will be enabled by default"""
    capture_video: bool = False
    """whether to capture videos of the agent performances (check out `videos` folder)"""

    # Algorithm specific arguments
    env_id: str = "CartPole-v1"
    """the id of the environment"""
    total_timesteps: int = 500000
    """total timesteps of the experiments"""
    learning_rate: float = 2.5e-4
    """the learning rate of the optimizer"""
    num_envs: int = 4
    """the number of parallel game environments"""
    num_steps: int = 128
    """the number of steps to run in each environment per policy rollout"""
    anneal_lr: bool = True
    """Toggle learning rate annealing for policy and value networks"""
    gamma: float = 0.99
    """the discount factor gamma"""
    gae_lambda: float = 0.95
    """the lambda for the general advantage estimation"""
    num_minibatches: int = 4
    """the number of mini-batches"""
    update_epochs: int = 4
    """the K epochs to update the policy"""
    norm_adv: bool = True
    """Toggles advantages normalization"""
    clip_coef: float = 0.2
    """the surrogate clipping coefficient"""
    clip_vloss: bool = True
    """Toggles whether or not to use a clipped loss for the value function, as per the paper."""
    ent_coef: float = 0.01
    """coefficient of the entropy"""
    vf_coef: float = 0.5
    """coefficient of the value function"""
    max_grad_norm: float = 0.5
    """the maximum norm for the gradient clipping"""
    target_kl: float = None
    """the target KL divergence threshold"""

    # to be filled in runtime
    batch_size: int = 0
    """the batch size (computed in runtime)"""
    minibatch_size: int = 0
    """the mini-batch size (computed in runtime)"""
    num_iterations: int = 0
    """the number of iterations (computed in runtime)"""

In [44]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer


# TODO network design
class ActorCritic(nn.Module):
    def __init__(self, env):
        super().__init__()
        self.n_ens = env.observation_space[1].shape[0]
        n_maps = len(env.single_observation_space[0])
        n_state_params = env.single_observation_space[1].shape[0]
        print(n_state_params)
        self.map_to_hidden = nn.Sequential(
            nn.Conv2d(n_maps, 12, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(12, 6, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            layer_init(nn.Linear(6*6*6, 128)),
            nn.ReLU()
        )
        self.state_params_to_hidden = nn.Sequential(
            layer_init(nn.Linear(n_state_params, 64)),
            nn.ReLU(),
        )
        self.map_and_state_params_combine = nn.Sequential(
            layer_init(nn.Linear(128 + 64, 64)),
            nn.ReLU(),
        )
        
        self.actor = nn.Sequential(
            layer_init(nn.Linear(64, 128)),
            nn.ReLU(),
            layer_init(nn.Linear(128, 16*5)),
            nn.ReLU(),
        )

        self.critic = nn.Sequential(
            layer_init(nn.Linear(64, 16)),
            nn.ReLU(),
            layer_init(nn.Linear(16, 1)),
        )

    def combine(self, x):
        maps, state_params = x
        map_hidden = self.map_to_hidden(maps)
        state_params_hidden = self.state_params_to_hidden(state_params)
        return self.map_and_state_params_combine(torch.cat((map_hidden, state_params_hidden), dim=-1))
    
    def get_value(self, x):
        return self.critic(self.combine(x))

    def get_action_and_value(self, x, action=None):
        x = self.combine(x)
        logits = self.actor(x).reshape(self.n_ens, 16,5)
        #print(logits)
        probs = Categorical(logits=logits)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action), probs.entropy(), self.critic(x)

In [45]:
def train(exp_name, args):
    args.exp_name = exp_name
    args.batch_size = int(args.num_envs * args.num_steps)
    args.minibatch_size = int(args.batch_size // args.num_minibatches)
    args.num_iterations = args.total_timesteps // args.batch_size
    run_name = f"{args.exp_name}__{args.seed}__{int(time.time())}"
    writer = SummaryWriter(f"runs/{run_name}")
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )

    # TRY NOT TO MODIFY: seeding
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = args.torch_deterministic

    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

    # env setup
    envs = gym.vector.SyncVectorEnv(
        [env_fn for i in range(args.num_envs)],
    )

    agent = ActorCritic(envs).to(device)
    optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)
    # ALGO Logic: Storage setup
    obs = (torch.zeros((args.num_steps, args.num_envs) + np.array(envs.single_observation_space[0]).shape).to(device),torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space[1].shape).to(device))
    actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device)
    logprobs = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device)
    rewards = torch.zeros((args.num_steps, args.num_envs)).to(device)
    dones = torch.zeros((args.num_steps, args.num_envs)).to(device)
    values = torch.zeros((args.num_steps, args.num_envs)).to(device)

    # TRY NOT TO MODIFY: start the game
    global_step = 0
    start_time = time.time()
    next_obs, _ = envs.reset(seed=args.seed)
    next_obs = (torch.Tensor(next_obs[0]).to(device).reshape((args.num_envs,) + np.array(envs.single_observation_space[0]).shape),
                torch.Tensor(next_obs[1]).reshape((args.num_envs,)+envs.single_observation_space[1].shape).to(device))
    next_done = torch.zeros(args.num_envs).to(device)
    for iteration in range(1, args.num_iterations + 1):
        # Annealing the rate if instructed to do so.
        if args.anneal_lr:
            frac = 1.0 - (iteration - 1.0) / args.num_iterations
            lrnow = frac * args.learning_rate
            optimizer.param_groups[0]["lr"] = lrnow

        for step in range(0, args.num_steps):
            global_step += args.num_envs
            obs[0][step] = next_obs[0]
            obs[1][step] = next_obs[1]
            dones[step] = next_done
            # ALGO LOGIC: action logic
            with torch.no_grad():
                action, logprob, _, value = agent.get_action_and_value(next_obs)
                values[step] = value.flatten()
            actions[step] = action
            logprobs[step] = logprob

            # TRY NOT TO MODIFY: execute the game and log data.
            next_obs, reward, terminations, truncations, infos = envs.step(action.cpu().numpy())
            next_done = np.logical_or(terminations, truncations)
            rewards[step] = torch.tensor(reward).to(device).view(-1)
            next_obs = (torch.Tensor(next_obs[0]).to(device).reshape((args.num_envs,) + np.array(envs.single_observation_space[0]).shape),
                        torch.Tensor(next_obs[1]).reshape((args.num_envs,)+envs.single_observation_space[1].shape).to(device))
            next_done = torch.Tensor(next_done).to(device)

            if "final_info" in infos:
                for info in infos["final_info"]:
                    if info and "episode" in info:
                        print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
                        writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step)
                        writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step)

        # bootstrap value if not done
        with torch.no_grad():
            next_value = agent.get_value(next_obs).reshape(1, -1)
            advantages = torch.zeros_like(rewards).to(device)
            lastgaelam = 0
            for t in reversed(range(args.num_steps)):
                if t == args.num_steps - 1:
                    nextnonterminal = 1.0 - next_done
                    nextvalues = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    nextvalues = values[t + 1]
                delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam
            returns = advantages + values

        # flatten the batch
        b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
        b_logprobs = logprobs.reshape(-1)
        b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
        b_advantages = advantages.reshape(-1)
        b_returns = returns.reshape(-1)
        b_values = values.reshape(-1)

        # Optimizing the policy and value network
        b_inds = np.arange(args.batch_size)
        clipfracs = []
        for epoch in range(args.update_epochs):
            np.random.shuffle(b_inds)
            for start in range(0, args.batch_size, args.minibatch_size):
                end = start + args.minibatch_size
                mb_inds = b_inds[start:end]

                _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions.long()[mb_inds])
                logratio = newlogprob - b_logprobs[mb_inds]
                ratio = logratio.exp()

                with torch.no_grad():
                    # calculate approx_kl http://joschu.net/blog/kl-approx.html
                    old_approx_kl = (-logratio).mean()
                    approx_kl = ((ratio - 1) - logratio).mean()
                    clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]

                mb_advantages = b_advantages[mb_inds]
                if args.norm_adv:
                    mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

                # Policy loss
                pg_loss1 = -mb_advantages * ratio
                pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef)
                pg_loss = torch.max(pg_loss1, pg_loss2).mean()

                # Value loss
                newvalue = newvalue.view(-1)
                if args.clip_vloss:
                    v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                    v_clipped = b_values[mb_inds] + torch.clamp(
                        newvalue - b_values[mb_inds],
                        -args.clip_coef,
                        args.clip_coef,
                    )
                    v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                    v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                    v_loss = 0.5 * v_loss_max.mean()
                else:
                    v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

                entropy_loss = entropy.mean()
                loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef

                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                optimizer.step()

            if args.target_kl is not None and approx_kl > args.target_kl:
                break

        y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
        var_y = np.var(y_true)
        explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

        # TRY NOT TO MODIFY: record rewards for plotting purposes
        writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
        writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
        writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
        writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)
        writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
        writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
        writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
        writer.add_scalar("losses/explained_variance", explained_var, global_step)
        print("SPS:", int(global_step / (time.time() - start_time)))
        writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)

    envs.close()
    writer.close()


In [46]:
name = "basic_test"
train(name, Args)

3


  gym.logger.warn(


AttributeError: 'ProxyAgent' object has no attribute 'available_unit_ids'