In [2]:
import torch
import os 

import traci
import sumo_rl
os.environ['SUMO_HOME'] = '/opt/homebrew/opt/sumo/share/sumo'


In [3]:
import torch.nn as nn

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using mps device


## Construct environment

In [6]:
from sumo_rl.environment.env import env, parallel_env, SumoEnvironment
from ray.tune import register_env
from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv
from ray.rllib.env.wrappers.multi_agent_env_compatibility import MultiAgentEnvCompatibility

from environment.envs import RealMultiAgentSumoEnv
from environment.observation import Grid2x2ObservationFunction, EntireObservationFunction
from environment.reward_functions import combined_reward

env_folder = "data/2x2grid"

multi_agent_env = RealMultiAgentSumoEnv(    
        net_file = os.path.join(env_folder, "2x2.net.xml"),
        route_file = os.path.join(env_folder, "2x2.rou.xml"),
        reward_fn = combined_reward,
        observation_class = EntireObservationFunction, 
        out_csv_name="outputs/2x2grid/ppo", 
        num_seconds=1000,
        add_per_agent_info=True,
        add_system_info=True,
        single_agent=False)

rllib_parallel_petting_env = ParallelPettingZooEnv(multi_agent_env)   # ParallelPettingZoo is a wrapper from rrlib, 
                                                                # that wraps an env into rrlib compatible one, it simplifies the API 

 Retrying in 1 seconds
Step #0.00 (0ms ?*RT. ?UPS, TraCI: 14ms, vehicles TOT 0 ACT 0 BUF 0)                     
 Retrying in 1 seconds


In [9]:
register_env(name='hjkh', env_creator= lambda config : rllib_parallel_petting_env)
import ray
ray.shutdown()

In [13]:
rllib_parallel_petting_env.get_agent_ids()
ray.shutdown()

In [147]:
multi_agent_env.unwrapped.observation_space('1')
multi_agent_env.unwrapped.env.observation_spaces('2')
# multi_agent_env.unwrapped.env.traffic_signals['2'].num_green_phases

Box(0.0, 1.0, (84,), float32)

# Policy network

In [148]:
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch 

# inputs should be observation_function - size is 84 size array 
# because each intersection has 8 main lanes, 8 * 2 + (1 + 4) = 21. 4 ts so we have 84 ... 
# 4 stands for a one-hot encoder for the current traffic phase - each item in it needs to be a separate feature to be injected into the ANN
# Use Softmax activation function to chose between the 4 actions  
# We want every single agent in network to train its own model - each learns its own policy. 

class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.flatten = nn.Flatten()
        self.soft_max = nn.Softmax()
        self.state_dim = state_dim
        self.action_dim = action_dim
        
        self.linear_network = nn.Sequential(
            nn.Linear(self.state_dim,100), # 2 - 3 layers. 84 neurones because each policy absorbs the entire observation space 
            nn.ReLU(),
            nn.Linear(100,100), # 100 neurones to start with - neurones should be approx within range of no. features
            nn.ReLU(),
            nn.Linear(100,self.action_dim), # 4 represents the phases each intersection entails. 
        )

    def forward(self, x):
        if type(x)==dict:
            arr = np.array(x.values())
            x = torch.from_numpy(arr)   
            print(x)
        output = self.linear_network(x)
        logits = self.soft_max(output)
        return logits

# Value network

In [149]:
class ValueNetwork(nn.Module):
    def __init__(self, state_dim):
        super().__init__()
        self.flatten = nn.Flatten()
        self.soft_max = nn.Softmax()
        self.state_dim = state_dim
        
        self.linear_network = nn.Sequential(
            nn.Linear(self.state_dim,100), # 2 - 3 layers. 84 neurones because each policy absorbs the entire observation space 
            nn.ReLU(),
            nn.Linear(100,100), # 100 neurones to start with - neurones should be approx within range of no. features
            nn.ReLU(),
            nn.Linear(100,1), # outputs the value of being in a particular state 
        )

    def forward(self, x):
        if type(x)==dict:
            arr = np.array(x.values())
            x = torch.from_numpy(arr)   
            print(x)
        output = self.linear_network(x)
        return output 

## Main loop

In [150]:
multi_agent_env.aec_env.unwrapped.env.traffic_signals['1'].green_phases

[Phase(duration=60, state='GGrrrrGGrrrr', minDur=-1, maxDur=-1),
 Phase(duration=60, state='rrGrrrrrGrrr', minDur=-1, maxDur=-1),
 Phase(duration=60, state='rrrGGrrrrGGr', minDur=-1, maxDur=-1),
 Phase(duration=60, state='rrrrrGrrrrrG', minDur=-1, maxDur=-1)]

In [151]:
def sample_action_from_policy(policy_network:PolicyNetwork, obs:torch.Tensor):
    ''''Samples action from env, after 1 pass from policy network, using argmax to pick'''
    pred_prob = policy_network(obs) # forward propagate the network 
    return pred_prob.argmax()

def convert_arr_to_tensor(obs:np.ndarray):
    return torch.from_numpy(obs)

def get_agent_observation_as_tensor(all_agents_obs:dict, agent_id:str):
    '''Takes in entire observations, returning observations for particular agent in tensor form'''
    agent_obs = all_agents_obs[agent_id]
    return convert_arr_to_tensor(agent_obs)

In [152]:
agent_ids = multi_agent_env.possible_agents

def instantiate_policies(agent_ids):
    policies = {id:PolicyNetwork() for id in agent_ids}
    return policies

In [153]:
def _generate_trajectory(multi_agent_env, all_observations:dict, timesteps:int) -> tuple: 
    '''Expecting observations for all agents from multi-agent environment setup.
    Ensure to pass in the current observations of agents in env'''
    
    observations = all_observations

    num_agents = multi_agent_env.num_agents

    observation_trajectories = torch.zeros(num_agents, timesteps, len(observations['1'])) 
    action_trajectories = torch.zeros(num_agents, timesteps)
    reward_trajectories = torch.zeros(num_agents, timesteps)

    for t in range(timesteps):

        agents_actions = {agent_id:None for agent_id in agent_ids}

        for i, id in enumerate(agent_ids):
            # get immediate action from policy network
            agent_obs = get_agent_observation_as_tensor(observations, agent_id=id)
            action = sample_action_from_policy(agent_policies[id], agent_obs) 

            observation_trajectories[i][t] = agent_obs
            action_trajectories[i][t] = action

            agents_actions[id]= int(action) # update this, as next it will go in the step() func

        # print(agents_actions)

        observations, rewards, terminations, truncations, infos = multi_agent_env.step(agents_actions) # takes in a dictionary of all agents + their corresponding actions
        
        for i, id in enumerate(agent_ids):
            reward_trajectories[i][t] = rewards[id]

    return observation_trajectories, action_trajectories, reward_trajectories
        

In [154]:
observations, infos = multi_agent_env.reset()
o, a, r = _generate_trajectory(multi_agent_env, observations, 10)

observations, infos = multi_agent_env.reset()
rewards = False 

agent_ids = multi_agent_env.possible_agents
agent_policies = instantiate_policies(agent_ids)

import matplotlib.pyplot as plt 

print(np.shape(o))
print(np.shape(a))
print(np.shape(r))

 Retrying in 1 seconds


NameError: name 'agent_policies' is not defined

In [None]:
from typing import Tuple

def generate_episodes(multi_agent_env_fn, no_episodes, no_timesteps, value_networks: list[ValueNetwork]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    '''Generate multiple episodes returning obs, actions, rewards, advantages tensors over all the episodes'''

    num_agents = multi_agent_env_fn.num_agents

    # initialise advantage trajectories
    episodic_advantages_trajectories = torch.zeros(no_episodes, num_agents, no_timesteps)
    episodic_obs_trajectories = torch.zeros(no_episodes, num_agents, no_timesteps, 84)
    episodic_reward_trajectories = torch.zeros(no_episodes, num_agents, no_timesteps)
    episodic_action_trajectories = torch.zeros(no_episodes, num_agents, no_timesteps)

    for ep_i in range(no_episodes): 
        observations_trajec, actions_trajec, rewards_trajec = _generate_trajectory(multi_agent_env, observations, no_timesteps)

        for agent_i, agent_id in enumerate(multi_agent_env.agents):
            
            reversed_returns = np.zeros((no_timesteps))
            reversed_value_baseline =  np.zeros((no_timesteps)) # do we need separate vectors for returns or baseline, or keep just one vector

            reversed_advantages = np.zeros((no_timesteps))

            running_returns = 0

            for t in reversed(range(no_timesteps)): # be careful - t is reversed here 
                
                returns = rewards_trajec[agent_i][t]
                running_returns += returns # we have not implemented a 1 step return, currently it consists of entire return. 

                agent_observations_t = observations_trajec[agent_i][t]
                agent_value_func_t = value_networks[agent_i](agent_observations_t) # simple forward pass in network to calculate value of state. 

                advantage = running_returns - agent_value_func_t

                reversed_advantages[t] = advantage

            advantages = reversed_advantages[::-1]

            for t in range(no_timesteps): 
                episodic_advantages_trajectories[ep_i][agent_i][t] = advantages[t] # reverse the array
            
        episodic_obs_trajectories[ep_i] = observations_trajec
        episodic_action_trajectories[ep_i] = actions_trajec
        episodic_reward_trajectories[ep_i] = rewards_trajec
    
    return episodic_obs_trajectories, episodic_action_trajectories, episodic_reward_trajectories, episodic_advantages_trajectories

In [None]:
from torch.utils.data import TensorDataset, DataLoader

num_agents = multi_agent_env.num_agents

value_networks = [ValueNetwork() for agent in range(num_agents)]

observations, info = multi_agent_env.reset()

episodic_obs, episodic_action, episodic_reward, episodic_advantages = generate_episodes(multi_agent_env, 5, 10, value_networks) 

# create batches for agent 0
agent_obs_ep0 = episodic_obs[0][0] # outputs [10, 84]
agent_obs_ep1 = episodic_obs[1][0] 
agent_obs_ep2 = episodic_obs[2][0] 
agent_obs_ep3 = episodic_obs[3][0]  

batched_agent_obs = episodic_obs[2][0][0:5] # outputs obs between 0-5 timesteps 
batched_agent_obs = episodic_obs[3][0][0:5] # outputs obs between 0-5 timesteps 

agent_0_dataset = TensorDataset(episodic_obs[:][0], episodic_action[:][0], episodic_reward[:][0], episodic_advantages[:][0])

Step #50.00 (0ms ?*RT. ?UPS, TraCI: 265ms, vehicles TOT 16 ACT 15 BUF 0)                  
 Retrying in 1 seconds


  return self._call_impl(*args, **kwargs)


In [220]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
from copy import deepcopy

class PPO:
    def __init__(self, multi_agent_par_env, state_dim, action_dim, lr=1e-3, discount=0.99, clip_epsilon=0.2):
        self.agents_neuralnetwork = [self._init_agent(state_dim, action_dim, lr) for _ in range(multi_agent_par_env.num_agents)]
        self.discount = discount
        self.clip_epsilon = clip_epsilon
        self.multi_agent_env = multi_agent_par_env
        # self.entropy_coeff

        self.agent_ids = self.multi_agent_env.possible_agents
        self.num_agents = self.multi_agent_env.num_agents

        # copy the neural network 
        # self.agents_neuralnetwork_old = [{"policy": self.agents_neuralnetwork[i]['policy'], "value": self.agents_neuralnetwork[i]['value'], \
        #                                  "policy_opt": self.agents_neuralnetwork[i]['policy_opt'], "value_opt": self.agents_neuralnetwork[i]['value_opt']}\
        #                                  for i in range(self.num_agents)]

    def _init_agent(self, state_dim, action_dim, lr):
        policy_net = PolicyNetwork(state_dim, action_dim)
        value_net = ValueNetwork(state_dim)
        policy_optimizer = optim.Adam(policy_net.parameters(), lr=lr)
        value_optimizer = optim.Adam(value_net.parameters(), lr=lr)
        return {"policy": policy_net, "value": value_net, "policy_opt": policy_optimizer, "value_opt": value_optimizer}

    def generate_actions(self, observations:dict):
        '''Takes in observations of all agents, returning the action vectors for use in environment'''
        return {id: action for i, (id, action) in enumerate(zip(self.agent_ids, self.agents_neuralnetwork[i]['policy'](observations)))}

    def sample_policy_action(self, agent_id:int, observations):
        return self.agents_neuralnetwork[agent_id]['policy'](observations)

    def generate_value(self, agent_id, agent_observations:torch.Tensor) -> torch.Tensor: 
        return self.agents_neuralnetwork['value'](agent_observations)

    def generate_values(self, observations:torch.Tensor): 
        ''''Generates values for all agents when given all observations'''
        return np.array([self.generate_values(id, agent_observations) for id, agent_observations in enumerate(self.agent_ids, observations)])

    def compute_advantage(self, agentID): 
        pass 

    # def _generate_trajectory(self, observations:dict, timesteps:int) -> tuple:
    #     '''Expecting observations for all agents from multi-agent parallel environment setup.
    #     Ensure to pass in the current observations of all agents in env'''

    #     observation_trajectories = torch.zeros(self.num_agents, timesteps, len(observations['1'])) 
    #     action_trajectories = torch.zeros(self.num_agents, timesteps)
    #     reward_trajectories = torch.zeros(self.num_agents, timesteps)
    #     pred_prob_trajectories = torch.zeros(self.num_agents, timesteps)

    #     for t in range(timesteps):

    #         agents_actions = {agent_id:None for agent_id in self.agent_ids} # initialise

    #         for i, id in enumerate(self.agent_ids):
    #             # get immediate action from policy network
    #             agent_obs = get_agent_observation_as_tensor(observations, agent_id=id)
    #             pred_probs = self.sample_policy_action(id, agent_obs) # Each agent will sample from its own policy
    #             action = pred_probs.arg_max

    #             observation_trajectories[i][t] = agent_obs
    #             action_trajectories[i][t] = action
    #             pred_prob_trajectories[i][t] = pred_probs.max()

    #             agents_actions[id]= int(action) # update this, as next it will go in the step() func

    #         observations, rewards, terminations, truncations, infos = self.multi_agent_par_env.step(agents_actions) # takes in a dictionary of all agents + their corresponding actions
            
    #         for i, id in enumerate(self.agent_ids):
    #             reward_trajectories[i][t] = rewards[id]

    #     return observation_trajectories, action_trajectories, reward_trajectories, pred_prob_trajectories
    
    def _generate_trajectory_np(self, observations:dict, timesteps:int) -> tuple:
        '''Expecting observations for all agents from multi-agent parallel environment setup.
        Ensure to pass in the current observations of all agents in env'''

        observation_trajectories = np.zeros((self.num_agents, timesteps, len(observations['1'])))
        action_trajectories = np.zeros((self.num_agents, timesteps))
        reward_trajectories = np.zeros((self.num_agents, timesteps))
        pred_prob_trajectories = np.zeros((self.num_agents, timesteps))

        for t in range(timesteps):

            agents_actions = {agent_id:None for agent_id in self.agent_ids} # initialise

            for i, id in enumerate(self.agent_ids):
                # get immediate action from policy network
                agent_obs = get_agent_observation_as_tensor(observations, agent_id=id)
                pred_probs = self.sample_policy_action(id, agent_obs) # Each agent will sample from its own policy
                action = pred_probs.arg_max()

                observation_trajectories[i][t] = agent_obs.numpy()
                action_trajectories[i][t] = action.numpy()
                pred_prob_trajectories[i][t] = pred_probs.max().numpy()

                agents_actions[id]= int(action) # update this, as next it will go in the step() func

            observations, rewards, terminations, truncations, infos = self.multi_agent_par_env.step(agents_actions) # takes in a dictionary of all agents + their corresponding actions
            
            for i, id in enumerate(self.agent_ids):
                reward_trajectories[i][t] = rewards[id]

        return observation_trajectories, action_trajectories, reward_trajectories, pred_prob_trajectories

    # def generate_episodes(self, observations, no_episodes, no_timesteps) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    #     '''Generate multiple episodes returning obs, actions, rewards, advantages tensors over all the episodes'''

    #     episodic_advantages = torch.zeros(no_episodes, self.num_agents, no_timesteps)
    #     episodic_obs = torch.zeros(no_episodes, self.num_agents, no_timesteps, 84)
    #     episodic_rewards = torch.zeros(no_episodes, self.num_agents, no_timesteps)
    #     episodic_actions = torch.zeros(no_episodes, self.num_agents, no_timesteps)

    #     episodic_returns = torch.zeros(no_episodes, self.num_agents, no_timesteps)
    #     episodic_pred_probs = torch.zeros(no_episodes, self.num_agents, no_timesteps)
    #     episodic_pred_values = torch.zeros(no_episodes, self.num_agents, no_timesteps)


    #     for ep_i in range(no_episodes): 
    #         observations_trajec, actions_trajec, rewards_trajec, pred_prob_trajec = self._generate_trajectory(observations, no_timesteps)

    #         for i, agent_id in enumerate(self.agent_ids):
                
    #             reversed_advantages = np.zeros((no_timesteps))
    #             reversed_returns =  np.zeros((no_timesteps))
    #             reversed_pred_values = np.zeros((no_timesteps))

    #             running_returns = 0

    #             for t in reversed(range(no_timesteps)): # t is reversed here 
                    
    #                 # calculate returns from rewards 
    #                 rewards = rewards_trajec[i][t]
    #                 running_returns += rewards # we have not implemented a 1 step return, currently it consists of entire return. 
    #                 reversed_returns[t] = running_returns 

    #                 agent_observations_t = observations_trajec[i][t]
    #                 agent_pred_value = self.generate_value(agent_id, agent_observations_t) # simple forward pass in network to calculate value of state. 

    #                 advantage = running_returns - agent_pred_value

    #                 reversed_advantages[t] = advantage
    #                 reversed_pred_values[t] = agent_pred_value


    #             advantages = reversed_advantages[::-1]
    #             returns = reversed_returns[::-1] 
    #             pred_values = reversed_pred_values[::-1]

    #             for t in range(no_timesteps): 
    #                 episodic_advantages[ep_i][i][t] = advantages[t] # append it to the larger episodic tensor
    #                 episodic_returns[ep_i][i][t] = returns[t]
    #                 episodic_pred_values[ep_i][i][t] = pred_values[t]

    #         # append other data to large episode tensor
    #         episodic_obs[ep_i] = observations_trajec
    #         episodic_actions[ep_i] = actions_trajec
    #         episodic_rewards[ep_i] = rewards_trajec
    #         episodic_pred_probs[ep_i] = pred_prob_trajec

    #     return episodic_obs, episodic_actions, episodic_rewards, returns, episodic_advantages, episodic_pred_probs, episodic_pred_values

    def generate_episodes_np(self, observations, no_episodes, no_timesteps) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        '''Generate multiple episodes returning obs, actions, rewards, advantages tensors over all the episodes'''

        episodic_advantages = np.zeros((self.num_agents, no_episodes, no_timesteps))
        episodic_obs = np.zeros((self.num_agents, no_episodes, no_timesteps, 84))
        episodic_rewards = np.zeros((self.num_agents, no_episodes, no_timesteps))
        episodic_actions = np.zeros((self.num_agents, no_episodes, no_timesteps))

        episodic_returns = np.zeros((self.num_agents, no_episodes, no_timesteps))
        episodic_pred_probs = np.zeros((self.num_agents, no_episodes, no_timesteps))
        episodic_pred_values = np.zeros((self.num_agents, no_episodes, no_timesteps))

        for ep_i in range(no_episodes): 
            observations_trajec, actions_trajec, rewards_trajec, pred_prob_trajec = self._generate_trajectory_np(observations, no_timesteps)

            for agent_i, agent_id in enumerate(self.agent_ids):
                
                reversed_advantages = np.zeros((no_timesteps))
                reversed_returns =  np.zeros((no_timesteps))
                reversed_pred_values = np.zeros((no_timesteps))

                running_returns = 0

                for t in reversed(range(no_timesteps)): # t is reversed here 
                    
                    # calculate returns from rewards 
                    rewards = rewards_trajec[agent_i][t]
                    running_returns += rewards # we have not implemented a 1 step return, currently it consists of entire return. 
                    reversed_returns[agent_i] = running_returns 

                    agent_observations_t = observations_trajec[agent_i][t]
                    agent_pred_value = self.generate_value(agent_id, agent_observations_t) # simple forward pass in network to calculate value of state. 

                    advantage = running_returns - agent_pred_value

                    reversed_advantages[t] = advantage
                    reversed_pred_values[t] = agent_pred_value

                advantages = reversed_advantages[::-1]
                returns = reversed_returns[::-1] 
                pred_values = reversed_pred_values[::-1]

                for t in range(no_timesteps): 
                    episodic_advantages[ep_i][agent_i][t] = advantages[t] # append it to the larger episodic tensor
                    episodic_returns[ep_i][agent_i][t] = returns[t]
                    episodic_pred_values[ep_i][agent_i][t] = pred_values[t]

                # append other data to large episode tensor
                episodic_obs[agent_i][ep_i] = observations_trajec[agent_i]
                episodic_actions[agent_i][ep_i] = actions_trajec[agent_i]
                episodic_rewards[agent_i][ep_i] = rewards_trajec[agent_i]
                episodic_pred_probs[agent_i][ep_i] = pred_prob_trajec[agent_i]

        return episodic_obs, episodic_actions, episodic_rewards, episodic_returns, episodic_advantages, episodic_pred_probs, episodic_pred_values

    def __compute_policy_loss(self, old_log_probs, new_log_probs, advantages):
        '''Takes sequence of log_probs and advantages, calculates J(0) which is the prob ratios * advantages'''

        # Calculate the ratio of new and old probabilities
        ratios = torch.exp(new_log_probs - old_log_probs)

        # Calculate surrogate loss
        surr1 = ratios * advantages
        surr2 = torch.clamp(ratios, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * advantages
        policy_loss = -torch.min(surr1, surr2).mean()  # Negative because we perform gradient ascent

        return policy_loss
    
    def __compute_value_loss(self, actual_returns, predicted_values) -> torch.Tensor:

        # Mean squared error loss between predicted and actual returns
        value_loss = F.mse_loss(predicted_values, actual_returns)
        
        return value_loss

    def update_agents_neuralnetwork_old():
        pass
    
    def update_agents_neuralnetwork_old(self):
        '''This function does a deep copy of the current agent policies, returning dictionaries'''

        self.agents_neuralnetwork_old = [{"policy": self.agents_neuralnetwork[i]['policy'].deepCopy(), "value": self.agents_neuralnetwork[i]['value'].deepCopy(), \
                                         "policy_opt": self.agents_neuralnetwork[i]['policy_opt'].deepCopy(), \
                                            "value_opt": self.agents_neuralnetwork[i]['value_opt'].deepCopy()}\
                                         for i in range(self.num_agents)]
        
        return self.agents_neuralnetwork_old


    def update_network(self, agent_enumer, observations_batch, pred_probs_batch, advantages_batch, returns_batch, pred_values_batch):
        '''Expects data in tensor format, returns in tensor format'''
    
        agent_i, agent_id = agent_enumer # unwrap tuple(int,str)

        old_pred_probs_batch = np.zeros((len(observations_batch), 4))
        
        for i, observations in enumerate(observations_batch):
            old_pred_probs_batch[i] = self.agents_neuralnetwork_old[agent_i]['policy'](observations) # calc predicted probabilities from the old network 
        
        policy_loss = self.__compute_policy_loss(old_pred_probs_batch, pred_probs_batch, advantages_batch)
        
        # Backpropagate policy loss
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Calculate value loss
        value_loss = self.__compute_value_loss(returns_batch, pred_values_batch)

        # Backpropagate value loss
        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        return policy_loss, value_loss 

    def train(self, minibatch_size, no_episodes, no_timesteps, no_epochs):
        ''''MA Training happens here. Updates the network, keeping track of metrics throughout'''

        train_info_index = 0

        batch_size = no_episodes * no_timesteps # e.g. 4*100*100 -> 40000 this is due to flattening the vectors.  

        sampler_size = batch_size // minibatch_size # from predicting size 

        # All training data will go here 
        train_info = {id:{'policy_loss': np.zeros((no_epochs*sampler_size)), 'value_loss':np.zeros((no_epochs*sampler_size))} for id in self.agent_ids}

        for epoch_i in range(no_epochs):
            
            episodic_obs, episodic_actions, episodic_rewards, episodic_returns, episodic_advantages, \
                episodic_pred_probs, episodic_pred_values = self.generate_episodes_np(multi_agent_env, no_episodes, no_timesteps)
            
            # episodic_obs (no_agents, no_epi, time_steps, dim)

            subset_sampler = SubsetRandomSampler(range(batch_size)) # random assort integers from 1 - 84, put in list [3, 4, 9, 84, ...]
            sampler = BatchSampler(subset_sampler, minibatch_size, True) # divide this list into batches of size minibatch_size [1, 3, ..], [54, 76, 2..]

            for agent_i, agent_id in enumerate(self.agent_ids): # update network for every agent
                agent_obs = episodic_obs[agent_id].reshape(-1,84)

                agent_returns = episodic_returns[agent_id].reshape(-1) # (no_epi * time_steps) 1dim
                agent_pred_values = episodic_pred_values[agent_id].reshape(-1)

                agent_pred_probs = episodic_pred_probs[agent_id].reshape(-1) # (no_epi, time_steps, dim) -> (no_epi*timesteps, dim)
                agent_advantages = episodic_advantages[agent_id].reshape(-1)

                for k, indices in enumerate(sampler):
                    
                    with torch.no_grad():
                        agent_obs_batch = torch.tensor(agent_obs[indices]) # shape = (len(indices), 84)

                        agent_pred_probs_batch = torch.tensor(agent_pred_probs[indices])
                        agent_advantages_batch = torch.tensor(agent_advantages[indices])

                        agent_returns_batch = torch.tensor(agent_returns[indices])
                        agent_pred_values_batch = torch.tensor(agent_pred_values[indices])

                    policy_loss, value_loss = self.update_network((agent_i, agent_id), agent_obs_batch, agent_pred_probs_batch, \
                                                                  agent_advantages_batch, agent_returns_batch, agent_pred_values_batch)
                    

                    train_info[agent_id]['policy_loss'][train_info_index+k] = policy_loss
                    train_info[agent_id]['value_loss'][train_info_index+k] = value_loss

            train_info_index += len(sampler)
        
        return train_info   

In [221]:
ppo_algo = PPO(multi_agent_env, 84, 4)
ppo_algo.train(5, 12, 15, 3)

TypeError: 'aec_to_parallel_wrapper' object is not subscriptable

In [None]:
np.shape(np.zeros((2,2,2,2)))
rand = torch.randperm(64).numpy()
sampler = [rand[i * 4:(i + 1) * 4] for i in range(4)]

sampler

[array([15, 31, 53, 23]),
 array([51, 59, 49, 26]),
 array([57, 18, 27, 45]),
 array([ 3, 46, 25,  5])]

In [None]:
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler

In [None]:
import numpy as np
subset_sampler = SubsetRandomSampler(range(32))
sampler = BatchSampler(subset_sampler, 3, True)

for i, idx in enumerate(sampler):
    print(idx)


[16, 26, 3]
[7, 10, 31]
[2, 0, 12]
[19, 8, 13]
[1, 28, 23]
[27, 22, 24]
[21, 15, 5]
[6, 17, 25]
[9, 30, 20]
[18, 14, 11]


In [None]:
minibatch_size = 3
batch_size = 244444

subset_sampler = SubsetRandomSampler(range(batch_size))
sampler = BatchSampler(subset_sampler, minibatch_size, True)

sampler_size = batch_size // minibatch_size
sampler_size == len(sampler)

True

In [None]:
import torch
torch.randperm(len(range(10)))

tensor([0, 9, 2, 3, 4, 8, 1, 6, 7, 5])

In [None]:
next(iter(subset_sampler))

1

In [None]:
a = np.random.randint(1,43, size =(4,3,1,18))
a.shape

(4, 3, 1, 18)

In [None]:
b = a.reshape(-1, *a.shape[3:])

In [None]:
torch.tensor((5)).item()

5

In [None]:
b.shape

(12, 18)

In [None]:
batch_size = 12
minibatch_size = 5
num_mini_batch = 4

rand = torch.randperm(batch_size).numpy()
sampler = [rand[i * minibatch_size:(i + 1) * minibatch_size] for i in range(num_mini_batch)] # rand[0:4],  rand[4:10], etc
print('rand: ', rand)
print('sampler: ', sampler)

rand:  [ 0 10  2  9  6  8  3  5  7  1  4 11]
sampler:  [array([ 0, 10,  2,  9,  6]), array([8, 3, 5, 7, 1]), array([ 4, 11]), array([], dtype=int64)]


In [None]:
# now we are going to use these indices to 
arrays = []
for indices in sampler:
    minibatch = b[indices]
    arrays.append(minibatch)

In [203]:
arrays

NameError: name 'arrays' is not defined

In [204]:
import torch
a = torch.tensor((2,3))

In [None]:
a.max().numpy()

array(3)

In [None]:
import numpy as np
num_agents = 2
no_episodes = 3
no_timesteps = 5

episodic_obs = np.zeros((num_agents, no_episodes, no_timesteps, 2))

reward_trajectories = np.ones((num_agents, no_timesteps, 2))

In [None]:
episodic_obs

array([[[[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]]],


       [[[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]]]])

In [None]:
reward_trajectories

array([[[1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.]],

       [[1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.]]])

In [None]:
for agent in range(num_agents): 
    episodic_obs[agent][1] = reward_trajectories[agent]

episodic_obs

array([[[[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]],

        [[1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]]],


       [[[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]],

        [[1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]]]])

In [None]:
for agent in range(num_agents): 
    episodic_obs[agent][1] = reward_trajectories[agent][:]

episodic_obs

array([[[[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]],

        [[1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]]],


       [[[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]],

        [[1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]]]])

In [None]:
a = np.array([3,3,4,5])
a[[1,2,2,2,2,1]]


array([3, 4, 4, 4, 4, 3])

In [None]:
a = np.ones((4,5,5,84))
len(a)

4

In [None]:
b = a.reshape(-1,84)
len(b)
len(b[0])

84

In [None]:
b.shape

(100, 84, 1)

In [155]:
no_epochs = 5
sampler_size = 10


train_info = {id:{'policy_loss': np.zeros((no_epochs*sampler_size)), 'value_loss':np.zeros((no_epochs*sampler_size))} for id in multi_agent_env.agents}


In [160]:
train_info['1']['policy_loss'][0]

0.0

In [168]:
agent_obs = episodic_obs[1].reshape(-1,84)

ValueError: cannot reshape array of size 30 into shape (84)

In [176]:
episodic_obs = np.zeros((3,4,84))
episodic_obs

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

In [183]:
np.shape(episodic_obs.reshape(-1, 84))

(12, 84)

In [184]:
episodic_obs.reshape(-1, 84)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [190]:
multi_agent_env.pos

AttributeError: 'aec_to_parallel_wrapper' object has no attribute 'pos'

In [199]:
observations_batch = np.zeros((5,4))

old_log_probs_batch = np.zeros((len(observations_batch), 4))
for i, observations in enumerate(observations_batch):
    old_log_probs_batch[i] = observations
    print(i)
    



0
1
2
3
4


In [200]:
old_log_probs_batch

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [207]:
np.shape(torch.tensor(np.zeros((3,4))))

torch.Size([3, 4])