## copy of environment class

In [4]:
import gym
from gym import spaces
from typing import Optional
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import seaborn as sns
import os
import glob
import logging
import json
import random
import time
import torch
from collections import Counter
sns.set_theme()


def restructure_edges(network):
    """
    This function restructures the edges from list of dicts
    to one dict, to improve construction of edges matrix and 
    env vectorization
    """

    new_edges= {'source_id':[],'target_id':[],'reward':[]}
    for e in network['edges']:
        new_edges['source_id'].append(e['source_id'])
        new_edges['target_id'].append(e['target_id'])
        new_edges['reward'].append(e['reward'])
    return new_edges 


class Reward_Network(gym.Env):
    
    def __init__(self, network, to_log=False):
        
        #-------------
        # assert tests TODO
        #-------------

        # reward network information from json file (can be just one network or multiple networks)
        self.network = network
       
        # initial reward and step values
        self.INIT_REWARD = 0
        self.INIT_STEP = 0
        self.MAX_STEP = 8
        self.N_NODES = 10
        self.N_NETWORKS = len(self.network)

        # define node numbers (from 0 to 9)
        self.nodes = torch.stack([torch.arange(10)]*self.N_NETWORKS,dim = 0)

        # initialize action space ("reward adjacency matrix")
        # NOTE intially I thought about the value 0 to be the value that signals that there is no edge between two nodes,
        # however since 0 is also a possible reward I have put 1 as the value in the reward adjacency matrix that represents
        # no edge between two nodes
        self.buffer_action_space = torch.full((self.N_NODES, self.N_NODES), 1).long()  
        self.action_space = torch.full((self.N_NETWORKS,self.N_NODES, self.N_NODES), 1).long()  
        self.new_edges = list(map(restructure_edges,network))
        for n in range(self.N_NETWORKS):
            source = torch.tensor(self.new_edges[n]['source_id']).long()
            target = torch.tensor(self.new_edges[n]['target_id']).long()
            reward = torch.tensor(self.new_edges[n]['reward']).long()
            self.buffer_action_space[source,target]=reward
            self.action_space[n,:,:] = self.buffer_action_space


        print(f'example of action space for 1 env: {self.action_space[0,:,:]}')
        # all_edges_source (all the row indices of actions where reward is not null)
        self.all_edges_source = [torch.where(self.action_space[i,:,:]!=1)[0] for i in range(self.N_NETWORKS)] 
        print(f'example of all edges source for 1 env: {self.all_edges_source[0]}')
        # all_edges_target (all the column indices of actions where reward is not null)
        self.all_edges_target = [torch.where(self.action_space[i,:,:]!=1)[1] for i in range(self.N_NETWORKS)] 
        print(f'example of all edges target for 1 env: {self.all_edges_target[0]}')
        
        self.possible_rewards = [-100, -20, 0, 20, 140]


    def reset(self):
        # Reset the state of the environment to an initial state
        self.reward_balance = torch.full((self.N_NETWORKS,1),self.INIT_REWARD)
        self.step_counter = torch.full((self.N_NETWORKS,1),self.INIT_STEP)
        self.is_done = False #torch.full((len(self.network),1),False)
        self.current_node = torch.IntTensor(list(map(lambda n: n['starting_node'], self.network)))

        print('Environment initialized: \n')
        print(f'- set of nodes of shape {self.nodes.shape}')
        print(f'- action space of shape {self.action_space.shape}')
        print(f'- reward balance of shape {self.reward_balance.shape}')
        print(f'- step counter of shape {self.step_counter.shape}')
        print(f'- current node of shape {self.current_node.shape}')

    
    def step(self, action):
        '''
        Take a step in all environments; here action corresponds to the target nodes for each env
        action_i \in [0,1,2,3,4,5,6,7,8,9]
        '''

        self.source_node = self.current_node
        print(f'Source nodes are: {self.current_node}, we are going to new nodes {action}')
        self.rewards = torch.unsqueeze( torch.tensor([self.action_space[i,self.current_node[i],action[i]] for i in range(self.N_NETWORKS)]), dim=-1)
        print(f'We get rewards : {self.rewards[:,0]}')
        self.reward_balance = torch.add(self.reward_balance,self.rewards)
        print(f'New reward balance is: {self.reward_balance[:,0]}')
        self.current_node = action
        print(f'Now we are in nodes: {self.current_node}')
        self.step_counter = torch.add(self.step_counter,1)
        print(f'Step counter for all networks is: {self.step_counter[:,0]}')
        print('\n')

        if torch.all(self.step_counter == 8):
            self.is_done = True
           
        #return {'source_node':self.source_node,
        #        'current_node':self.current_node,
        #        'reward':action['reward'],
        #        'total_reward':self.reward_balance,
        #        'n_steps':self.step_counter,
        #        'done':self.is_done}


    def get_state(self):
        """
        this function returns the current state of the environment.
        State information given by this funciton is less detailed compared
        to the observation. 
        """
        return {'current_node':self.current_node,
                'total_reward':self.reward_balance,
                'n_steps':self.step_counter,
                'done':self.is_done}


    def get_possible_rewards(self,env_action_space,source_idx,target_idx):
        """
        this function returns the rewards associated to the row (source node)
        and column (target node) indices of an environment

        TODO: fix
        """
        return torch.gather(env_action_space,1,target_idx)
        #return env_action_space[source_idx.chunk(chunks=len(source_idx),dim=0),target_idx.chunk(chunks=len(target_idx),dim=0)]

    def observe(self):
        """
        TODO: CHANGE FOR VECTORIZATION
        this function returns observation from the environment
        """
        #self.valid_edges_source = [torch.where(self.all_edges_source[i]==self.current_node[i])[:] for i in range(self.N_NETWORKS)]
        self.valid_edges_source = [torch.where(self.all_edges_source[i]==self.current_node[i])[0][:] for i in range(self.N_NETWORKS)]
        self.valid_edges_target = [self.all_edges_target[i][self.valid_edges_source[i]] for i in range(self.N_NETWORKS)]

        return {'current_node':self.current_node,
                'next_possible_nodes':self.valid_edges_target,
                #'next_possible_rewards':torch.stack(list(map(lambda x: x[self.valid_edges_source,self.valid_edges_target],self.action_space)), dim=0),
                'next_possible_rewards':[self.get_possible_rewards(self.action_space[i,:,:],self.valid_edges_source[i],self.valid_edges_target[i]) for i in range(self.N_NETWORKS)],
                'total_reward':self.reward_balance,
                'n_steps':self.step_counter,
                'done':self.is_done}

    #def observe(self):
    #    """
    #    TODO: CHANGE FOR VECTORIZATION
    #    this function returns observation from the environment
    #    """
    #    return {'current_node':self.current_node,
    #            'actions_available':[n for n in self.action_space if n['source_id'] == self.current_node],
    #            'next_possible_nodes':np.asarray([n['target_id'] for n in self.action_space if n['source_id'] == self.current_node]),
    #            'next_possible_rewards':np.asarray([n['reward'] for n in self.action_space if n['source_id'] == self.current_node]),
    #            'total_reward':self.reward_balance,
    #            'n_steps':self.step_counter,
    #            'done':self.is_done}

## load a few networks and test (with random policy)

In [5]:
data_dir=r"/Users/bonati/Desktop/CHM/reward_networks/data/rawdata"
with open(os.path.join(data_dir,'train.json')) as json_file:
    train = json.load(json_file)

test = train[0:5]

In [6]:
N = Reward_Network(test)
N.reset()

example of action space for 1 env: tensor([[   1,    1,  -20,    1,    1, -100,    1,    1,    1,    1],
        [  20,    1,    1,    1,   20,    1,    1,    1,    1,    1],
        [   1,    0,    1,  -20,    1,    1,    1,    1,    1,    1],
        [   1,   20,    1,    1,  -20,    1,    1,    1,    1,    1],
        [ -20,    1,   20,    1,    1,    1,    1,    1,    1,    1],
        [   1,    1,    1,    0,    1,    1,    0,    1,    1,    1],
        [   1,    1,    1,    1,    1,    1,    1,    0,   20,    1],
        [   1,    1,    1,    1,    1,  -20,    1,    1,    1,    0],
        [   1,    1,    1,    1,    1,    1,   20,    1,    1,   20],
        [   1,    1,    1,    1,    1,    1,    1,  140,  140,    1]])
example of all edges source for 1 env: tensor([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9])
example of all edges target for 1 env: tensor([2, 5, 0, 4, 1, 3, 1, 4, 0, 2, 3, 6, 7, 8, 5, 9, 6, 9, 7, 8])
Environment initialized: 

- set of nodes of sha

In [7]:
N.action_space[0,:,:]

tensor([[   1,    1,  -20,    1,    1, -100,    1,    1,    1,    1],
        [  20,    1,    1,    1,   20,    1,    1,    1,    1,    1],
        [   1,    0,    1,  -20,    1,    1,    1,    1,    1,    1],
        [   1,   20,    1,    1,  -20,    1,    1,    1,    1,    1],
        [ -20,    1,   20,    1,    1,    1,    1,    1,    1,    1],
        [   1,    1,    1,    0,    1,    1,    0,    1,    1,    1],
        [   1,    1,    1,    1,    1,    1,    1,    0,   20,    1],
        [   1,    1,    1,    1,    1,  -20,    1,    1,    1,    0],
        [   1,    1,    1,    1,    1,    1,   20,    1,    1,   20],
        [   1,    1,    1,    1,    1,    1,    1,  140,  140,    1]])

In [8]:
N.observe()

RuntimeError: Index tensor must have the same number of dimensions as input tensor

In [9]:
while N.is_done==False:
    print(f'Current node in the envs: {N.current_node}')
    print('---------------------------------------------')
    
    valid_edges_source = [torch.where(N.all_edges_source[i]==N.current_node[i]) for i in range(len(test))]
    print(f'valid_edges_source (all the row indices of actions where reward is not null AND the source node is the current node in env)')
    print(*valid_edges_source,sep='\n')
    valid_edges_target = [N.all_edges_target[i][valid_edges_source[i]] for i in range(len(test))]
    print(f'valid_edges_target (all the column indices of actions where reward is not null AND the source node is the current node in env)')
    print(*valid_edges_target,sep='\n')
    print('\n')
    random_action = torch.stack(list(map(lambda x: random.choice(x),valid_edges_target)), dim=0)
    print(f'Actions (or target edge) ranodmly selected in each env: {random_action}')

    N.step(random_action)
    

Current node in the envs: tensor([0, 0, 0, 0, 0], dtype=torch.int32)
---------------------------------------------
valid_edges_source (all the row indices of actions where reward is not null AND the source node is the current node in env)
(tensor([0, 1]),)
(tensor([0, 1, 2, 3]),)
(tensor([0, 1, 2, 3]),)
(tensor([0, 1, 2, 3, 4, 5]),)
(tensor([0, 1, 2, 3, 4, 5]),)
valid_edges_target (all the column indices of actions where reward is not null AND the source node is the current node in env)
tensor([2, 5])
tensor([1, 2, 3, 5])
tensor([1, 2, 3, 5])
tensor([1, 2, 3, 5, 6, 7])
tensor([1, 2, 3, 5, 6, 7])


Actions (or target edge) ranodmly selected in each env: tensor([5, 1, 1, 7, 7])
Source nodes are: tensor([0, 0, 0, 0, 0], dtype=torch.int32), we are going to new nodes tensor([5, 1, 1, 7, 7])
We get rewards : tensor([-100,   20,   20, -100, -100])
New reward balance is: tensor([-100,   20,   20, -100, -100])
Now we are in nodes: tensor([5, 1, 1, 7, 7])
Step counter for all networks is: tensor

## Try Gym.Spaces to define observation space of the environment in order to use stable-baselines3?

https://stackoverflow.com/questions/56448260/how-could-i-define-the-observation-space-for-my-custom-openai-enviroment
https://github.com/openai/gym/blob/master/gym/spaces/graph.py
https://github.com/openai/gym/issues/2912

https://stable-baselines3.readthedocs.io/en/master/common/env_checker.html (current version of env does not completely follow Gym API since we custom define the action and observation space without using the gym.spaces submodule)
https://stable-baselines3.readthedocs.io/en/master/modules/dqn.html
https://stable-baselines3.readthedocs.io/en/master/guide/vec_envs.html


The problem with the openai gym enviroments is that their space wrappers generate the action space and observation space but there doesn't seem to be much room for customization

A possibility could be to define `Tuple(Discrete(10),Discrete(10))` or `MultiDiscrete ([10,10])` for the action space, and a utility function to check if actions are valid in the current node you're at.

Observation space might be a bit trickier, should include at least current node, possible actions, step counter and done flag