## copy of environment class

TODO: reproduce work from agent based here

In [1]:
import gym
from gym import spaces
from typing import Optional
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import seaborn as sns
import os
import glob
import logging
import json
import random
import time
import torch
from collections import Counter
from typing import Optional
sns.set_theme()


def restructure_edges(network):
    """
    This function restructures the edges from list of dicts
    to one dict, to improve construction of edges matrix and 
    env vectorization
    """

    new_edges= {'source_id':[],'target_id':[],'reward':[]}
    for e in network['edges']:
        new_edges['source_id'].append(e['source_id'])
        new_edges['target_id'].append(e['target_id'])
        new_edges['reward'].append(e['reward'])
    return new_edges 


class Reward_Network(gym.Env):
    
    def __init__(self, network, to_log=False):
        
        #-------------
        # assert tests TODO
        #-------------

        # reward network information from json file (can be just one network or multiple networks)
        self.network = network
       
        # initial reward and step values
        self.INIT_REWARD = 0
        self.INIT_STEP = 0
        self.MAX_STEP = 8
        self.N_NODES = 10
        self.N_NETWORKS = len(self.network)

        # define node numbers (from 0 to 9)
        self.nodes = torch.stack([torch.arange(10)]*self.N_NETWORKS,dim = 0)
        # define starting nodes
        self.starting_nodes = torch.tensor(list(map(lambda n: n['starting_node'], self.network)), dtype=torch.long)
        # define possible rewards along with corresponding reward index
        self.possible_rewards = {-100:1, -20:2, 0:3, 20:4, 140:5}

        # initialize action space ("reward adjacency matrix")
        # NOTE intially I thought about the value 0 to be the value that signals that there is no edge between two nodes,
        # however since 0 is also a possible reward I have put 1 as the value in the reward adjacency matrix that represents
        # no edge between two nodes
        self.buffer_action_space = torch.full((self.N_NODES, self.N_NODES), 1).long()  
        self.action_space = torch.full((self.N_NETWORKS,self.N_NODES, self.N_NODES), 1).long()  
        self.new_edges = list(map(restructure_edges,network))
        self.network_idx = torch.arange(self.N_NETWORKS, dtype=torch.long)
        for n in range(self.N_NETWORKS):
            source = torch.tensor(self.new_edges[n]['source_id']).long()
            target = torch.tensor(self.new_edges[n]['target_id']).long()
            reward = torch.tensor(self.new_edges[n]['reward']).long()
            self.buffer_action_space[source,target]=reward
            self.action_space[n,:,:] = self.buffer_action_space


        # get adjacency matrix with reward index instead of actual reward
        # 0 here means that no edge is present, all other indices from 1 to 5 indicate a reward
        # (the higher the index number, the higher the reward)
        # (using this solution for now: https://discuss.pytorch.org/t/mapping-values-in-a-tensor/117731)
        self.action_space_idx = self.action_space.detach().clone()
        self.action_space_idx.apply_(lambda val: self.possible_rewards.get(val, 0))
        print(f'example of action space idx for 1 env: {self.action_space_idx[0,:,:]}')

        # boolean adjacency matrix 
        self.edge_is_present = torch.squeeze(torch.unsqueeze(self.action_space!=1,dim=-1))
        
        #self.all_edges_source = [torch.where(self.action_space[i,:,:]!=1)[0] for i in range(self.N_NETWORKS)] 
        #self.all_edges_target = [torch.where(self.action_space[i,:,:]!=1)[1] for i in range(self.N_NETWORKS)] 
        


    def reset(self):
        # Reset the state of the environment to an initial state
        self.reward_balance = torch.full((self.N_NETWORKS,1),self.INIT_REWARD)
        self.step_counter = torch.full((self.N_NETWORKS,1),self.INIT_STEP)
        self.is_done = False 
        self.current_node = self.starting_nodes.clone()
        #self.current_node = torch.tensor(list(map(lambda n: n['starting_node'], self.network)), dtype=torch.long)

        print('Environment initialized: \n')
        print(f'- set of nodes of shape {self.nodes.shape}')
        print(f'- action space of shape {self.action_space.shape}')
        print(f'- reward balance of shape {self.reward_balance.shape}')
        print(f'- step counter of shape {self.step_counter.shape}')
        print(f'- current node of shape {self.current_node.shape}')

    
    def step(self, action):
        '''
        Take a step in all environments; here action corresponds to the target nodes for each env
        action_i \in [0,1,2,3,4,5,6,7,8,9]
        '''

        self.source_node = self.current_node
        print(f'Source nodes are: {self.current_node}, we are going to new nodes {action}')

        self.rewards = torch.unsqueeze(self.action_space[self.network_idx,self.current_node,action], dim=-1)

        # TODO remove; only used to validate the equivalence during development
        rewards_old = torch.unsqueeze( torch.tensor([self.action_space[i,self.current_node[i],action[i]] for i in range(self.N_NETWORKS)]), dim=-1)
        torch.testing.assert_close(self.rewards, rewards_old)
        # remove end

        print(f'We get rewards : {self.rewards[:,0]}')
        self.reward_balance = torch.add(self.reward_balance,self.rewards)
        print(f'New reward balance is: {self.reward_balance[:,0]}')
        self.current_node = action
        print(f'Now we are in nodes: {self.current_node}')
        self.step_counter = torch.add(self.step_counter,1)
        print(f'Step counter for all networks is: {self.step_counter[:,0]}')
        print('\n')

        if torch.all(self.step_counter == 8):
            self.is_done = True
        
        
    def get_state(self):
        """
        this function returns the current state of the environment.
        State information given by this funciton is less detailed compared
        to the observation. 
        """
        return {'current_node':self.current_node,
                'total_reward':self.reward_balance,
                'n_steps':self.step_counter,
                'done':self.is_done}


    def get_possible_rewards(self,obs):
        """
        this function returns the next possible rewards given an observation;
        the rewards are selected using boolean masking, and the resulting array is split
        into sub-tensors whose size is given by how many valid edges are present in each network
        in the current observation.
        """
        self.n_rewards_per_network = torch.count_nonzero(obs['next_possible_nodes'],dim=1).tolist()
        self.next_rewards_all = torch.masked_select(obs['next_possible_rewards'][self.network_idx],obs['next_possible_nodes'][self.network_idx])
        self.next_rewards_per_network = torch.split(self.next_rewards_all,self.n_rewards_per_network)
        
        return self.next_rewards_per_network

    def observe(self):
        """
        TODO: CHANGE FOR VECTORIZATION
        this function returns observation from the environment
        """
        #self.valid_edges_source = [torch.where(self.all_edges_source[i]==self.current_node[i])[:] for i in range(self.N_NETWORKS)]
        #self.valid_edges_target = [self.all_edges_target[i][self.valid_edges_source[i]] for i in range(self.N_NETWORKS)]
        
        self.next_nodes = torch.squeeze(torch.unsqueeze(self.edge_is_present[self.network_idx,self.current_node,:],dim=-1))
        self.next_rewards = torch.squeeze(torch.unsqueeze(self.action_space[self.network_idx,self.current_node,:],dim=-1))
        self.next_rewards_idx = torch.squeeze(torch.unsqueeze(self.action_space_idx[self.network_idx,self.current_node,:],dim=-1))

        return {'current_node':self.current_node,
                'next_possible_nodes':self.next_nodes,#torch.squeeze(torch.unsqueeze(self.next_nodes[self.network_idx,self.network_idx,:],dim=-1)) ,
                'next_possible_rewards':self.next_rewards, #torch.squeeze(torch.unsqueeze(self.next_rewards[self.network_idx,self.network_idx,:],dim=-1)),
                'next_possible_rewards_idx':self.next_rewards_idx,
                'total_reward':self.reward_balance,
                'n_steps':self.step_counter,
                'done':self.is_done}

## Agent that uses pytorch functions to solve multiple environments

In [13]:
class Agent:

    def __init__(self,strategy: str):
        """
        initialize agent
        """
        assert strategy in ['random','highest_payoff'], f'a strategy {strategy} was given, but availabe strategies are {["random","highest_payoff","take_loss"]}'
        self.strategy = strategy

    def choose_action(self,obs):
        """
        Choose next action given an observation and a strategy
        """

        if self.strategy=='random':

            # create a copy of the next actions index matrix, this will be modified to 
            # select random indices (random next action to take) with torch.multinomial
            #self.next_actions_idx_p = obs['next_possible_rewards_idx'].detach().clone().to(torch.float32)
            # specify a probability matrix, uniform distribution over all valid (not 0) indices for each network index
            # (prob. associated to each valid action depends on the number of valid actions in each network)
            #self.p_one = torch.tensor([1,1,1,1,1])
            #self.p_divided = torch.divide(self.p_one,torch.count_nonzero(self.next_actions_idx_p,dim=1))
            # find where there are valid actions in the matrix
            #self.non_zero_indices = torch.argwhere(self.next_actions_idx_p) 
            # find specific indices of where valid actions are for each network index
            #self.next_actions_idx_list = list(torch.split(self.non_zero_indices,
            #                                              torch.count_nonzero(self.next_actions_idx_p,dim=1).tolist()))

            # TO FIX! still uses for loop! How to improve this?
            #for i in range(len(self.next_actions_idx_list)):
            #    self.next_actions_idx_p[self.next_actions_idx_list[i][:,0],self.next_actions_idx_list[i][:,1]]=self.p_divided[i]
            
            self.next_action = torch.squeeze(torch.multinomial(obs['next_possible_nodes'].type(torch.float),1))
            #self.next_action = torch.squeeze(torch.multinomial(self.next_actions_idx_p,1,replacement=False))
            print(f'shape of next aciton in agent: {self.next_action.shape}')

        elif self.strategy=='highest_payoff':
            self.next_action = torch.argmax(obs['next_possible_rewards_idx'],dim=1)
        
        return self.next_action

## load a few networks and test (with random policy)

In [14]:
data_dir=r"../data"
with open(os.path.join(data_dir,'train.json')) as json_file:
    train = json.load(json_file)

test = train[0:5]

In [15]:
N = Reward_Network(test)
A = Agent('random')
N.reset()

example of action space idx for 1 env: tensor([[0, 0, 2, 0, 0, 1, 0, 0, 0, 0],
        [4, 0, 0, 0, 4, 0, 0, 0, 0, 0],
        [0, 3, 0, 2, 0, 0, 0, 0, 0, 0],
        [0, 4, 0, 0, 2, 0, 0, 0, 0, 0],
        [2, 0, 4, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 3, 0, 0, 3, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 3, 4, 0],
        [0, 0, 0, 0, 0, 2, 0, 0, 0, 3],
        [0, 0, 0, 0, 0, 0, 4, 0, 0, 4],
        [0, 0, 0, 0, 0, 0, 0, 5, 5, 0]])
Environment initialized: 

- set of nodes of shape torch.Size([5, 10])
- action space of shape torch.Size([5, 10, 10])
- reward balance of shape torch.Size([5, 1])
- step counter of shape torch.Size([5, 1])
- current node of shape torch.Size([5])


In [16]:
N.action_space[0,:,:]

tensor([[   1,    1,  -20,    1,    1, -100,    1,    1,    1,    1],
        [  20,    1,    1,    1,   20,    1,    1,    1,    1,    1],
        [   1,    0,    1,  -20,    1,    1,    1,    1,    1,    1],
        [   1,   20,    1,    1,  -20,    1,    1,    1,    1,    1],
        [ -20,    1,   20,    1,    1,    1,    1,    1,    1,    1],
        [   1,    1,    1,    0,    1,    1,    0,    1,    1,    1],
        [   1,    1,    1,    1,    1,    1,    1,    0,   20,    1],
        [   1,    1,    1,    1,    1,  -20,    1,    1,    1,    0],
        [   1,    1,    1,    1,    1,    1,   20,    1,    1,   20],
        [   1,    1,    1,    1,    1,    1,    1,  140,  140,    1]])

In [11]:
N.observe()

{'current_node': tensor([0, 0, 0, 0, 0]),
 'next_possible_nodes': tensor([[False, False,  True, False, False,  True, False, False, False, False],
         [False,  True,  True,  True, False,  True, False, False, False, False],
         [False,  True,  True,  True, False,  True, False, False, False, False],
         [False,  True,  True,  True, False,  True,  True,  True, False, False],
         [False,  True,  True,  True, False,  True,  True,  True, False, False]]),
 'next_possible_rewards': tensor([[   1,    1,  -20,    1,    1, -100,    1,    1,    1,    1],
         [   1,   20,  -20,   20,    1, -100,    1,    1,    1,    1],
         [   1,   20,  -20,   20,    1, -100,    1,    1,    1,    1],
         [   1,   20,  -20,   20,    1, -100, -100, -100,    1,    1],
         [   1,   20,  -20,    0,    1, -100, -100, -100,    1,    1]]),
 'next_possible_rewards_idx': tensor([[0, 0, 2, 0, 0, 1, 0, 0, 0, 0],
         [0, 4, 2, 4, 0, 1, 0, 0, 0, 0],
         [0, 4, 2, 4, 0, 1, 0, 0, 0

In [17]:
# NEW: Using the observation to step in the environment
while N.is_done==False:
    print(f'Current node in the envs: {N.current_node}')
    print('---------------------------------------------')

    obs = N.observe()
    for key, value in obs.items():
        print(key)
        print(value)
    print('\n')

    possible_rewards = N.get_possible_rewards(obs)
    print(f'possible rewards that we can get in next step: \n')
    print(possible_rewards)
    print('\n')
    
    # select action based on observation
    next_action = A.choose_action(obs)
    print(f'Next actions chosen: {next_action}')
    # take a step in envs
    N.step(next_action)

Current node in the envs: tensor([0, 0, 0, 0, 0])
---------------------------------------------
current_node
tensor([0, 0, 0, 0, 0])
next_possible_nodes
tensor([[False, False,  True, False, False,  True, False, False, False, False],
        [False,  True,  True,  True, False,  True, False, False, False, False],
        [False,  True,  True,  True, False,  True, False, False, False, False],
        [False,  True,  True,  True, False,  True,  True,  True, False, False],
        [False,  True,  True,  True, False,  True,  True,  True, False, False]])
next_possible_rewards
tensor([[   1,    1,  -20,    1,    1, -100,    1,    1,    1,    1],
        [   1,   20,  -20,   20,    1, -100,    1,    1,    1,    1],
        [   1,   20,  -20,   20,    1, -100,    1,    1,    1,    1],
        [   1,   20,  -20,   20,    1, -100, -100, -100,    1,    1],
        [   1,   20,  -20,    0,    1, -100, -100, -100,    1,    1]])
next_possible_rewards_idx
tensor([[0, 0, 2, 0, 0, 1, 0, 0, 0, 0],
       

## Try Gym.Spaces to define observation space of the environment in order to use stable-baselines3?

https://stackoverflow.com/questions/56448260/how-could-i-define-the-observation-space-for-my-custom-openai-enviroment
https://github.com/openai/gym/blob/master/gym/spaces/graph.py
https://github.com/openai/gym/issues/2912

https://stable-baselines3.readthedocs.io/en/master/common/env_checker.html (current version of env does not completely follow Gym API since we custom define the action and observation space without using the gym.spaces submodule)
https://stable-baselines3.readthedocs.io/en/master/modules/dqn.html
https://stable-baselines3.readthedocs.io/en/master/guide/vec_envs.html


The problem with the openai gym enviroments is that their space wrappers generate the action space and observation space but there doesn't seem to be much room for customization

A possibility could be to define `Tuple(Discrete(10),Discrete(10))` or `MultiDiscrete ([10,10])` for the action space, and a utility function to check if actions are valid in the current node you're at.

Observation space might be a bit trickier, should include at least current node, possible actions, step counter and done flag