#### Import libraries and modules

In [3]:
# Import foundation
from ai_economist import foundation

In [4]:
import numpy as np
import matplotlib.pyplot as plt
from IPython import display
# from utils import plotting  # plotting utilities for visualizing env. state

import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical

In [4]:
# Logger
import logging.config
import yaml

# Load the logging configuration from YAML, apply it, and route warnings
# issued through the `warnings` module into the logging system.
with open('configs/logging_config.yaml', 'r') as f:
    config = yaml.safe_load(f)
logging.config.dictConfig(config)
logging.captureWarnings(True)

def get_logger(name: str):
    """Return the logger registered under ``name``.

    Args:
        name (str): name of the logger, passed to ``logging.getLogger``.

    Returns:
        logging.Logger: the logger for ``name`` (the same object on every
        call with the same name).
    """
    return logging.getLogger(name)

# Notebook-wide logger. NOTE(review): presumably a 'jupyter' logger is declared in
# configs/logging_config.yaml; otherwise it inherits the root configuration -- verify.
LOG = get_logger('jupyter')

#### Environment  
Define environment config

In [5]:
# Keyword arguments for the environment; unpacked as **env_config into
# foundation.make_env_instance() when the environment is created.
env_config = {
    # ===== SCENARIO CLASS =====
    # Which Scenario class to use: the class's name in the Scenario Registry (foundation.scenarios).
    # The environment object will be an instance of the Scenario class.
    'scenario_name': 'layout_from_file/simple_wood_and_stone',
    
    # ===== COMPONENTS =====
    # Which components to use (specified as list of ("component_name", {component_kwargs}) tuples).
    #   "component_name" refers to the Component class's name in the Component Registry (foundation.components)
    #   {component_kwargs} is a dictionary of kwargs passed to the Component class
    # The order in which components reset, step, and generate obs follows their listed order below.
    'components': [
        # (1) Building houses
        ('Build', {'skill_dist': "pareto", 'payment_max_skill_multiplier': 3}),
        # (2) Trading collectible resources
        ('ContinuousDoubleAuction', {'max_num_orders': 5}),
        # (3) Movement and resource collection
        ('Gather', {}),
    ],
    
    # ===== SCENARIO CLASS ARGUMENTS =====
    # (optional) kwargs that are added by the Scenario class (i.e. not defined in BaseEnvironment)
    'env_layout_file': 'quadrant_25x25_20each_30clump.txt',
    'starting_agent_coin': 10,
    'fixed_four_skill_and_loc': True,
    
    # ===== STANDARD ARGUMENTS ======
    # kwargs that are used by every Scenario class (i.e. defined in BaseEnvironment)
    'n_agents': 2,          # Number of non-planner agents (must be > 1)
    'world_size': [25, 25], # [Height, Width] of the env world
    'episode_length': 1000, # Number of timesteps per episode
    
    # In multi-action-mode, the policy selects an action for each action subspace (defined in component code).
    # Otherwise, the policy selects only 1 action.
    'multi_action_mode_agents': False,
    'multi_action_mode_planner': True,
    
    # When flattening observations, concatenate scalar & vector observations before output.
    # Otherwise, return observations with minimal processing.
    'flatten_observations': False,
    # When flattening masks, concatenate each action subspace mask into a single array.
    # Note: flatten_masks = True is required for masking action logits in the code below
    # (the action sampler splits the flat 'action_mask' array per action subspace).
    'flatten_masks': True,
}

Create the environment instance from the `env_config` dictionary.  
This is the equivalent of `gym.make(env_name)`.


In [6]:
# Build the environment, passing every entry of env_config as a keyword argument.
env = foundation.make_env_instance(**env_config)

In [7]:
# Reset the environment and keep what reset() returns; later cells index it
# by agent id (e.g. obs['p']) and iterate over it with obs.items().
obs = env.reset()

In [14]:
# Count the entries in the observation dictionary stored under the 'p' key.
aaa = obs
print(len(aaa['p']))

17


### Info
The agents' dictionaries each have 24 elements and share the same structure.


AttributeError: 'dict' object has no attribute 'shape'

## Agent class and nn

In [33]:
# env.seed()

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise a layer in place and return it.

    The weight is filled with an orthogonal matrix scaled by ``std`` and the
    bias is set to the constant ``bias_const``.

    Args:
        layer: module exposing ``weight`` and ``bias`` tensors (e.g. ``nn.Linear``).
        std (float): gain applied to the orthogonal weight matrix.
        bias_const (float): value written to every bias element.

    Returns:
        The same ``layer`` object, so the call can be nested inside
        ``nn.Sequential(...)``.
    """
    nn.init.orthogonal_(layer.weight, gain=std)
    nn.init.constant_(layer.bias, val=bias_const)
    return layer

# TODO: confirm what `np.array(envs.single_observation_space.shape).prod()` yields.
# It multiplies the dimensions of a single observation's shape, i.e. the flattened observation size
# -> the `envs` passed here must be environments already converted to `numpy_env`.

class Agent(nn.Module):
    """Two independent MLPs, ``critic`` and ``actor``, built from ``envs``.

    Both networks take an input of size
    ``prod(envs.single_observation_space.shape)`` and use two hidden layers of
    64 units with Tanh activations. ``critic`` ends in a single output unit;
    ``actor`` ends in ``envs.single_action_space.n`` output units.

    No ``forward`` method is defined here; use ``self.critic`` and
    ``self.actor`` directly.
    """

    def __init__(self, envs):
        """
        Args:
            envs: object exposing ``single_observation_space.shape`` and
                ``single_action_space.n``.
        """
        super().__init__()
        obs_dim = np.array(envs.single_observation_space.shape).prod()

        # The critic is built before the actor so layer creation order is fixed.
        self.critic = self._mlp(obs_dim, 1, 1.0)
        self.actor = self._mlp(obs_dim, envs.single_action_space.n, 0.01)

    @staticmethod
    def _mlp(in_dim, out_dim, out_std):
        """Build the shared 64-64 Tanh MLP; ``out_std`` is the gain of the last layer."""
        return nn.Sequential(
            layer_init(nn.Linear(in_dim, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, out_dim), std=out_std),
        )
        

###### Actions need to be a dictionary of `id_agent` (obviously a string): `action_id`

In [11]:
# state == observation

# def sample_random_action(agent, mask, rew = 0):
#     """
#     Sample random UNMASKED action(s) for agent.
#     Args:
#         agent (ai_economist.foundation.agents.mobiles.BasicMobileAgent):
#         mask (array([1.], dtype=float32)): agent action mask
#         rew (dict): dictionary containing agent id and reward value FIXME
#     """
#     # Return a list of actions: 1 for each action subspace
#     # print(f"AGENT reward: {rew}")
    
#     if agent.multi_action_mode:
#         # by default used by the planner
#         split_masks = np.split(mask, agent.action_spaces.cumsum()[:-1])
#         return [np.random.choice(np.arange(len(m_)), p=m_/m_.sum()) for m_ in split_masks]

#     # Return a single action
#     else:
#         # here agent's training should be implemented
#         return np.random.choice(np.arange(agent.action_spaces), p=mask/mask.sum())

# def sample_random_actions(env, obs, rew = 0):
#     """
#     Samples random UNMASKED actions for each agent in obs.
#     Args:
#         env (ai_economist.foundation.scenarios): the used environment
#         obs (env.reset()): env.reset()
#         rew (): dictionary of agent:reward(float)
        
#     """
        
#     actions = {
#         a_idx: sample_random_action(env.get_agent(a_idx), a_obs['action_mask'], rew[str(a_idx)])
#         for a_idx, a_obs in obs.items()
#     }

#     return actions

### TEST
# obs = env.reset()
# actions = sample_random_actions(env, obs)
# print(actions)
#obs, rew, done, info = env.step(actions)






In [28]:
def sample_random_actions(env, obs, rew=None):
    """
    Sample a random UNMASKED action for every entry in ``obs``.

    This is the random baseline that a learned policy (e.g. PPO) is meant to
    replace; nothing is learned here.

    Args:
        env: environment exposing ``get_agent(agent_id)``. Each returned agent
            must provide ``multi_action_mode`` and ``action_spaces``.
        obs (dict): agent_id -> observation dict holding a flat
            ``'action_mask'`` array (the environment must be created with
            ``flatten_masks=True``).
        rew: rewards from the previous step (agent_id -> reward). Accepted so
            a learning policy can be plugged in later; it is not read yet, so
            any value, including the default, is valid.

    Returns:
        dict: agent_id -> action. An agent in multi-action mode gets a list
        with one action index per action subspace; every other agent gets a
        single action index.
    """
    actions = {}
    for a_idx, a_obs in obs.items():
        agent = env.get_agent(a_idx)
        mask = a_obs['action_mask']

        if agent.multi_action_mode:
            # action_spaces holds the size of each subspace; split the flat
            # mask at the cumulative sizes and sample each subspace on its own.
            split_masks = np.split(mask, agent.action_spaces.cumsum()[:-1])
            action = [
                np.random.choice(np.arange(len(m_)), p=m_ / m_.sum())
                for m_ in split_masks
            ]
        else:
            # Single action: action_spaces is the total number of actions.
            # This is where a trained policy would pick the action instead.
            action = np.random.choice(
                np.arange(agent.action_spaces), p=mask / mask.sum()
            )

        actions[a_idx] = action
    return actions

In [29]:
def rewards_dictionary(env):
    """
    Build a rewards dictionary with one entry per agent/planner id, all set to None.

    Note: the ids are obtained by calling ``env.reset()``, so the environment
    is reset as a side effect.

    Args:
        env (ai_economist.foundation.scenarios): the used environment

    Returns:
        rewards (dict): str(agent_id) : None
    """
    initial_obs = env.reset()
    return {str(agent_id): None for agent_id in initial_obs.keys()}

# Placeholder rewards (agent_id -> None) used for the very first step.
rewards = rewards_dictionary(env)

# Run 9 episodes ("epochs") with the current action sampler.
for i in range(1, 10):

    print(f"Epoch: {i}")

    obs = env.reset()
    done = False
    score = 0  # placeholder: not updated yet

    while not done:
        actions = sample_random_actions(env, obs, rewards)
        obs, rew, done, info = env.step(actions)
        # Rewards of this step are handed to the sampler on the next step.
        rewards = rew

        # NOTE(review): Foundation's env.step() is expected to report
        # termination as a dict ({'__all__': bool}) -- confirm against the
        # installed ai_economist version. A non-empty dict is always truthy,
        # so testing it directly would end every episode after one step.
        if isinstance(done, dict):
            done = done['__all__']

Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
