## copy of environment class

In [155]:
import gym
from gym import spaces
from typing import Optional
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import seaborn as sns
import os
import glob
import logging
import json
import random
import time
import torch
from collections import Counter
sns.set_theme()


def restructure_edges(network):
    """
    This function restructures the edges from list of dicts
    to one dict, to improve construction of edges matrix and 
    env vectorization
    """

    new_edges= {'source_id':[],'target_id':[],'reward':[]}
    for e in network['edges']:
        new_edges['source_id'].append(e['source_id'])
        new_edges['target_id'].append(e['target_id'])
        new_edges['reward'].append(e['reward'])
    return new_edges 


log_dir = r'C:\Users\Sara Bonati\Desktop\MPI_work\Machines\Reward_network_task\logs\dev'

class Reward_Network(gym.Env):
    
    def __init__(self, network, to_log=False):
        
        #-------------
        # assert tests
        #-------------

        # reward network information from json file (can be just one network or multiple networks)
        self.network = network
       
        # initial reward and step values
        self.INIT_REWARD = 0
        self.INIT_STEP = 0
        self.MAX_STEP = 8
        self.N_NODES = 10

        # network info
        #self.id = self.network['network_id']
        
        self.nodes = torch.stack([torch.arange(10)]*3,dim = 0)

        # initialize action space
        self.buffer_action_space = torch.full((self.N_NODES, self.N_NODES), 3).long()
        self.action_space = torch.full((len(self.network),self.N_NODES, self.N_NODES), 3).long()
        self.new_edges = list(map(restructure_edges,network))
        for n in range(len(self.network)):
            source = torch.tensor(self.new_edges[n]['source_id']).long()
            target = torch.tensor(self.new_edges[n]['target_id']).long()
            reward = torch.tensor(self.new_edges[n]['reward']).long()
            self.buffer_action_space[source,target]=reward
            self.action_space[n,:,:] = self.buffer_action_space

        
        self.possible_rewards = [-100, -20, 0, 20, 140]
        #self.reward_range = (min(self.possible_rewards)*self.MAX_STEP,self.network['max_reward'])

    def reset(self):
        # Reset the state of the environment to an initial state
        self.reward_balance = torch.full((len(self.network),1),self.INIT_REWARD)
        self.step_counter = torch.full((len(self.network),1),self.INIT_STEP)
        self.is_done = False #torch.full((len(self.network),1),False)
        self.current_node = torch.IntTensor(list(map(lambda n: n['starting_node'], self.network)))

        print('Environment initialized: \n')
        print(f'- set of nodes of shape {self.nodes.shape}')
        print(f'- action space of shape {self.action_space.shape}')
        print(f'- reward balance of shape {self.reward_balance.shape}')
        print(f'- step counter of shape {self.step_counter.shape}')
        print(f'- current node of shape {self.current_node.shape}')

    
    def step(self, action):

        self.source_node = self.current_node
        print(f'Source nodes are: {self.current_node}')
        self.rewards = torch.unsqueeze( torch.tensor([self.action_space[i,self.current_node[i],action[i]] for i in range(len(self.network))]), dim=-1)
        print(f'We get rewards : {self.rewards[:,0]}')
        self.reward_balance = torch.add(self.reward_balance,self.rewards)
        print(f'New reward balance is: {self.reward_balance[:,0]}')
        self.current_node = action
        print(f'Now we are in nodes: {self.current_node}')
        self.step_counter = torch.add(self.step_counter,1)
        print(f'Step counter for all networks is: {self.step_counter[:,0]}')
        print('\n')

        if torch.all(self.step_counter == 8):
            self.is_done = True
           
        #return {'source_node':self.source_node,
        #        'current_node':self.current_node,
        #        'reward':action['reward'],
        #        'total_reward':self.reward_balance,
        #        'n_steps':self.step_counter,
        #        'done':self.is_done}


    def get_state(self):
        """
        TODO: CHANGE FOR VECTORIZATION
        this function returns the current state of the environment.
        State information given by this funciton is less detailed compared
        to the observation. 
        """
        return {'current_node':self.current_node,
                'total_reward':self.reward_balance,
                'n_steps':self.step_counter,
                'done':self.is_done}

    def observe(self):
        """
        TODO: CHANGE FOR VECTORIZATION
        this function returns observation from the environment
        """
        return {'current_node':self.current_node,
                'actions_available':[n for n in self.action_space if n['source_id'] == self.current_node],
                'next_possible_nodes':np.asarray([n['target_id'] for n in self.action_space if n['source_id'] == self.current_node]),
                'next_possible_rewards':np.asarray([n['reward'] for n in self.action_space if n['source_id'] == self.current_node]),
                'total_reward':self.reward_balance,
                'n_steps':self.step_counter,
                'done':self.is_done}

  if LooseVersion(mpl.__version__) >= "3.0":
  other = LooseVersion(other)


## load a few networks and test (with random policy)

In [163]:
data_dir=r"C:\Users\Sara Bonati\Desktop\MPI_work\Machines\Reward_network_task\data\rawdata"
with open(os.path.join(data_dir,'train.json')) as json_file:
    train = json.load(json_file)

test = train[0:3]

In [178]:
N = Reward_Network(test)
N.reset()

Environment initialized: 

- set of nodes of shape torch.Size([3, 10])
- action space of shape torch.Size([3, 10, 10])
- reward balance of shape torch.Size([3, 1])
- step counter of shape torch.Size([3, 1])
- current node of shape torch.Size([3])


In [179]:
while N.is_done==False:
    print(f'Current node in the envs: {N.current_node}')
    
    all_edges_source = [torch.where(N.action_space[i,:,:]!=3)[0] for i in range(len(test))]
    print(f'all_edges_source (all the row indices of actions where reward is not null)')
    print(*all_edges_source,sep='\n')
    all_edges_target = [torch.where(N.action_space[i,:,:]!=3)[1] for i in range(len(test))]
    print(f'all_edges_target (all the column indices of actions where reward is not null)')
    print(*all_edges_target,sep='\n')
    valid_edges_source = [torch.where(all_edges_source[i]==N.current_node[i]) for i in range(len(test))]
    print(f'valid_edges_source (all the row indices of actions where reward is not null AND the source node is the current node in env)')
    print(*valid_edges_source,sep='\n')
    valid_edges_target = [all_edges_target[i][valid_edges_source[i]] for i in range(len(test))]
    print(f'valid_edges_target (all the column indices of actions where reward is not null AND the source node is the current node in env)')
    print(*valid_edges_target,sep='\n')
    print('\n')
    random_action = torch.stack(list(map(lambda x: random.choice(x),valid_edges_target)), dim=0)
    print(f'Actions (or target edge) ranodmly selected in each env: {random_action}')

    N.step(random_action)
    

Current node in the envs: tensor([0, 0, 0], dtype=torch.int32)
all_edges_source (all the row indices of actions where reward is not null)
tensor([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9])
tensor([0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7,
        7, 7, 7, 8, 8, 8, 9, 9, 9])
tensor([0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5,
        5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 9])
all_edges_target (all the column indices of actions where reward is not null)
tensor([2, 5, 0, 4, 1, 3, 1, 4, 0, 2, 3, 6, 7, 8, 5, 9, 6, 9, 7, 8])
tensor([1, 2, 3, 5, 0, 2, 4, 1, 3, 5, 0, 1, 4, 5, 0, 2, 3, 0, 3, 6, 7, 8, 9, 5,
        6, 8, 9, 6, 7, 9, 4, 7, 8])
tensor([1, 2, 3, 5, 0, 2, 4, 1, 3, 4, 5, 0, 1, 2, 4, 5, 0, 2, 3, 5, 7, 0, 3, 6,
        8, 0, 7, 8, 9, 5, 6, 8, 9, 5, 6, 7, 9, 2, 3, 4, 7, 8])
valid_edges_source (all the row indices of actions where reward is not null AND the source node is the current node in env)
(ten