In [1]:
import numpy as np

In [2]:
from sklearn.metrics.pairwise import euclidean_distances

In [3]:
# Toy 2-D coordinates: the integers 0..9 laid out as (x, y) pairs, one row per point.
positions = np.reshape(np.arange(10), (-1, 2))

In [4]:
positions

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

In [5]:
# Pairwise Euclidean distances between every pair of rows in `positions`
# (only X is given, so the rows are compared against themselves).
dist_matrix = euclidean_distances(X=positions)
dist_matrix

array([[ 0.        ,  2.82842712,  5.65685425,  8.48528137, 11.3137085 ],
       [ 2.82842712,  0.        ,  2.82842712,  5.65685425,  8.48528137],
       [ 5.65685425,  2.82842712,  0.        ,  2.82842712,  5.65685425],
       [ 8.48528137,  5.65685425,  2.82842712,  0.        ,  2.82842712],
       [11.3137085 ,  8.48528137,  5.65685425,  2.82842712,  0.        ]])

In [6]:
# Zero out every entry strictly below 5. This overwrites `dist_matrix` in place,
# so the matrix displayed by the previous cell no longer reflects its contents.
below_threshold = dist_matrix < 5
dist_matrix[below_threshold] = 0

In [7]:
dist_matrix

array([[ 0.        ,  0.        ,  5.65685425,  8.48528137, 11.3137085 ],
       [ 0.        ,  0.        ,  0.        ,  5.65685425,  8.48528137],
       [ 5.65685425,  0.        ,  0.        ,  0.        ,  5.65685425],
       [ 8.48528137,  5.65685425,  0.        ,  0.        ,  0.        ],
       [11.3137085 ,  8.48528137,  5.65685425,  0.        ,  0.        ]])

In [8]:
from spektral.layers import GraphConv
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.backend import gather, squeeze
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2


## First trial

In [9]:
class GraphicEncoder():
    """Graph-convolution encoder followed by a small dense policy head.

    The Keras model takes three inputs -- node features ``X_in``, adjacency
    ``A_in`` and integer node indices ``RL_indice`` -- runs two ``GraphConv``
    layers, keeps only the rows selected by ``RL_indice`` and feeds them
    through three dense layers ending in 3 units.

    Attributes:
        model: the compiled-less Keras ``Model`` returned by ``build_model``.
    """

    def __init__(self, F):
        """Build the model for node feature vectors of length ``F``."""
        self.model = self.build_model(F)

    def build_model(self, F):
        """Assemble and return the Keras model.

        Args:
            F: number of features per node (last dimension of ``X_in``).

        Returns:
            A Keras ``Model`` with inputs ``[X_in, A_in, RL_indice]`` and a
            single output produced by the ``policy_3`` layer.
        """
        X_in = Input(shape=(F,), name='X_in')
        A_in = Input(shape=(None,), name='A_in')
        RL_indice = Input(shape=(None,), batch_size=1, name='RL_indice', dtype='int32')

        ### Graph convolution

        x = GraphConv(32, activation='relu', name='gcn1')([X_in, A_in])
        x = GraphConv(32, activation='relu', name='gcn2')([x, A_in])

        # The indices are passed as a real layer input rather than captured in
        # the lambda's closure; a captured tensor is invisible to the Keras
        # graph, so RL_indice would show up as a disconnected input.
        x = Lambda(lambda tensors: gather(tensors[0], tensors[1]), name='slice')([x, RL_indice])

        # RL_indice carries a fixed leading dimension of 1 (batch_size=1); drop it again.
        x = Lambda(lambda gathered: squeeze(gathered, 0), name='squeeze')(x)

        ### Policy network

        x = Dense(32, activation='relu', name='policy_1')(x)
        x = Dense(16, activation='relu', name='policy_2')(x)
        # NOTE(review): relu on the final layer clips the outputs at zero --
        # confirm this is intended for the action scores.
        x = Dense(3, activation='relu', name='policy_3')(x)

        model = Model(inputs=[X_in, A_in, RL_indice], outputs=x)
        # summary() prints the table itself and returns None, so it is not wrapped in print().
        model.summary()
        return model


feature_size = 9
rl_model = GraphicEncoder(feature_size)


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
X_in (InputLayer)               [(None, 9)]          0                                            
__________________________________________________________________________________________________
A_in (InputLayer)               [(None, None)]       0                                            
__________________________________________________________________________________________________
gcn1 (GraphConv)                (None, 32)           320         X_in[0][0]                       
                                                                 A_in[0][0]                       
__________________________________________________________________________________________________
gcn2 (GraphConv)                (None, 32)           1056        gcn1[0][0]                   

In [10]:
from gym.spaces.box import Box
from gym.spaces import Discrete
from gym.spaces.dict import Dict

In [11]:
from ray.rllib.models.tf.tf_modelv2 import TFModelV2

W0519 21:04:10.313662 4385142208 deprecation.py:323] From /anaconda3/envs/flow/lib/python3.6/site-packages/tensorflow_core/python/compat/v2_compat.py:88: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.
Instructions for updating:
non-resource variables are not supported in the long term


In [12]:
# Graph dimensions: N nodes, each described by F features.
N, F = 40, 9

# Per-node feature matrix, unbounded floats.
states = Box(low=-np.inf, high=np.inf, shape=(N, F), dtype=np.float32)
# N x N adjacency and length-N mask, both integer-valued within [0, 1].
adjacency = Box(low=0, high=1, shape=(N, N), dtype=np.int32)
mask = Box(low=0, high=1, shape=(N,), dtype=np.int32)

obs_space = Dict(dict(states=states, adjacency=adjacency, mask=mask))
# NOTE(review): the policies further down sample actions from {0, 1, 2},
# which does not fit inside high=1 -- confirm the intended action encoding.
act_space = Box(low=0, high=1, shape=(N,), dtype=np.int32)

## Second trial

In [13]:
import tensorflow as tf
from spektral.layers import GraphConv
from tensorflow.keras.layers import Input, Dense, Lambda, Multiply, Reshape, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

class GraphicPolicy(TFModelV2):
    """RLlib ``TFModelV2`` wrapping a graph-convolutional Keras model.

    The wrapped model applies two ``GraphConv`` layers and two dense layers
    to every node, then branches into

    * a per-node action head (``num_outputs`` units) multiplied by the mask
      input so that masked-out nodes produce all-zero rows, and
    * a scalar value head computed from the flattened per-node features.

    NOTE(review): RLlib usually constructs custom models as
    ``(obs_space, action_space, num_outputs, model_config, name)``; the extra
    leading ``N, F`` arguments here differ from that -- confirm how this class
    is meant to be registered.
    """

    def __init__(self, N, F, obs_space, action_space, num_outputs=3, model_config=None, name='graphic_policy'):
        """Build the Keras model and register its variables with RLlib.

        Args:
            N: number of nodes in the graph.
            F: number of features per node.
            obs_space: observation space forwarded to ``TFModelV2``.
            action_space: action space forwarded to ``TFModelV2``.
            num_outputs: number of units of the per-node action head.
            model_config: model config forwarded to ``TFModelV2``.
            name: model name forwarded to ``TFModelV2``.
        """
        super(GraphicPolicy, self).__init__(obs_space, action_space, num_outputs, model_config, name)
        # Filled in by forward(); value_function() reads it afterwards.
        self._value_out = None
        self.base_model = self.build_model(N, F, num_outputs)
        self.register_variables(self.base_model.variables)

    def build_model(self, N, F, num_outputs):
        """Assemble the two-headed Keras model.

        Returns:
            A Keras ``Model`` with inputs ``[X_in, A_in, mask]`` and outputs
            ``[masked action scores, value]``.
        """
        X_in = Input(shape=(N, F), name='X_in')
        A_in = Input(shape=(N, N), name='A_in')
        # shape must be a tuple; the original `(N)` was just the integer N.
        RL_indice = Input(shape=(N,), name='mask')

        ### Graph convolution

        x = GraphConv(32, activation='relu', name='gcn1')([X_in, A_in])
        x = GraphConv(32, activation='relu', name='gcn2')([x, A_in])

        ### Policy network

        x1 = Dense(32, activation='relu', name='policy_1')(x)
        x2 = Dense(16, activation='relu', name='policy_2')(x1)

        ###  Action and filter
        x3 = Dense(num_outputs, activation='relu', name='policy_3')(x2)
        # Give the mask a trailing axis so it multiplies across the num_outputs
        # scores of each node.
        mask = Reshape((N, 1), name='expend_dim')(RL_indice)
        out = Multiply(name='filter')([x3, mask])

        #### Value out
        x2 = Flatten(name='flatten')(x2)
        value = Dense(1, activation='linear', name='value_out')(x2)

        model = Model(inputs=[X_in, A_in, RL_indice], outputs=[out, value])
        # summary() prints the table itself and returns None, so it is not wrapped in print().
        model.summary()
        return model

    def forward(self, input_dict, state, seq_lens):
        """Run the Keras model on one batch of observations.

        Args:
            input_dict: RLlib input dict; ``input_dict['obs']`` is assumed to
                expose the keys 'states', 'adjacency' and 'mask' of the
                observation space -- TODO confirm against the env wrapper.
            state: recurrent state, passed through unchanged.
            seq_lens: unused.

        Returns:
            ``(model_out, state)`` where ``model_out`` is the masked action head.
        """
        obs = input_dict['obs']
        # Inputs are passed positionally in the order the Model was built with:
        # X_in, A_in, mask. (X_in/A_in/RL_indice are locals of build_model and
        # do not exist in this scope.)
        model_out, self._value_out = self.base_model(
            [obs['states'], obs['adjacency'], obs['mask']])
        return model_out, state

    def value_function(self):
        """Return the value estimates of the last forward() call, flattened to 1-D."""
        return tf.reshape(self._value_out, [-1])


model_config = {}
rl_model = GraphicPolicy(N, F, obs_space, act_space, model_config=model_config)

W0519 21:04:10.664104 4385142208 deprecation.py:506] From /anaconda3/envs/flow/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1635: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
X_in (InputLayer)               [(None, 40, 9)]      0                                            
__________________________________________________________________________________________________
A_in (InputLayer)               [(None, 40, 40)]     0                                            
__________________________________________________________________________________________________
gcn1 (GraphConv)                (None, 40, 32)       320         X_in[0][0]                       
                                                                 A_in[0][0]                       
__________________________________________________________________________________________________
gcn2 (GraphConv)                (None, 40, 32)       1056        gcn1[0][0]                 

## Test for the model + agents

In [14]:
import pickle

In [15]:
# Load the recorded transitions from disk.
# NOTE: unpickling executes arbitrary code embedded in the file -- only load
# pickles that come from a trusted source.
with open('training_data.pkl', mode='rb') as f:
    training_data = pickle.load(f)

In [16]:
training_data.keys()

dict_keys(['state', 'action', 'reward', 'done'])

In [17]:
len(training_data['state'])

2292

In [18]:
training_data['state'][300][1].shape

(40, 40)

In [19]:

from rl.memory import SequentialMemory

In [21]:
# Replay the recorded transitions into a fresh replay buffer.
memory_buffer = SequentialMemory(limit=5000, window_length=1)
recorded_steps = zip(
    training_data['state'],
    training_data['action'],
    training_data['reward'],
    training_data['done'],
)
for observation, action, reward, terminal in recorded_steps:
    # Steps whose action is not an ndarray are skipped.
    if not isinstance(action, np.ndarray):
        continue
    memory_buffer.append(observation, action, reward, terminal)

In [23]:
# Draw a mini-batch of 32 experiences and split it into per-field lists.
batch_experience = memory_buffer.sample(32)

state0_batch = [experience.state0 for experience in batch_experience]
state1_batch = [experience.state1 for experience in batch_experience]
reward_batch = [experience.reward for experience in batch_experience]
action_batch = [experience.action for experience in batch_experience]
# Stored inverted: 0. where terminal1 is truthy, 1. otherwise.
terminal1_batch = [0. if experience.terminal1 else 1. for experience in batch_experience]

In [25]:
from rl.processors import Processor
class Jiqian_MultiInputProcessor(Processor):
    """Processor that regroups a batch of multi-part observations by input.

    Each observation is a sequence of ``nb_inputs`` components; the batch is
    turned into ``nb_inputs`` arrays, one per component position.
    """

    def __init__(self, nb_inputs):
        """Remember how many components every observation must contain."""
        self.nb_inputs = nb_inputs

    def process_state_batch(self, state_batch):
        """Split ``state_batch`` into one array per model input.

        Args:
            state_batch: iterable of states, each state being an iterable of
                observations with exactly ``nb_inputs`` components.

        Returns:
            A list of ``nb_inputs`` numpy arrays; the i-th array stacks the
            i-th component of every observation in iteration order.
        """
        per_input = [list() for _ in range(self.nb_inputs)]
        for window in state_batch:
            for observation in window:
                assert len(observation) == self.nb_inputs
                for slot, component in enumerate(observation):
                    per_input[slot].append(component)

        return [np.array(components) for components in per_input]


multi_input_processor = Jiqian_MultiInputProcessor(3)

Using TensorFlow backend.


In [29]:
processed = multi_input_processor.process_state_batch(state0_batch)

In [30]:
len(processed)

3

In [64]:
class GraphicQNetworkKeras():
    """Plain-Keras graph network producing masked per-node action scores.

    Two ``GraphConv`` layers and two dense layers are applied to every node,
    followed by a softmax over ``num_outputs`` actions. The result is
    multiplied by the mask input, so masked-out nodes yield all-zero rows.

    Attributes:
        obs_space, action_space, num_outputs, model_config, name: constructor
            arguments, stored unchanged.
        base_model: the Keras ``Model`` returned by ``build_model``.
    """

    def __init__(self, N, F, obs_space, action_space, num_outputs=3, model_config=None, name='graphic_policy_keras'):
        """Store the configuration and build the Keras model.

        Args:
            N: number of nodes in the graph.
            F: number of features per node.
            obs_space: observation space (stored only).
            action_space: action space (stored only).
            num_outputs: number of actions scored per node.
            model_config: optional configuration (stored only).
            name: identifier for this network (stored only).
        """
        self.obs_space = obs_space
        self.action_space = action_space
        self.num_outputs = num_outputs
        # Previously accepted but silently dropped.
        self.model_config = model_config
        self.name = name
        self.base_model = self.build_model(N, F, num_outputs)

    def build_model(self, N, F, num_outputs):
        """Assemble the Keras model.

        Returns:
            A Keras ``Model`` with inputs ``[X_in, A_in, mask]`` and a single
            output: the masked per-node action scores.
        """
        X_in = Input(shape=(N, F), name='X_in')
        A_in = Input(shape=(N, N), name='A_in')
        # shape must be a tuple; the original `(N)` was just the integer N.
        RL_indice = Input(shape=(N,), name='mask')

        ### Graph convolution

        x = GraphConv(32, activation='relu', name='gcn1')([X_in, A_in])
        x = GraphConv(32, activation='relu', name='gcn2')([x, A_in])

        ### Policy network
        x1 = Dense(32, activation='relu', name='policy_1')(x)
        x2 = Dense(16, activation='relu', name='policy_2')(x1)

        ###  Action and filter
        # NOTE(review): softmax forces each node's scores to sum to 1, which is
        # unusual for Q-values -- confirm this is intended.
        x3 = Dense(num_outputs, activation='softmax', name='policy_3')(x2)
        # Give the mask a trailing axis so it multiplies across the num_outputs
        # scores of each node.
        mask = Reshape((N, 1), name='expend_dim')(RL_indice)
        qout = Multiply(name='filter')([x3, mask])

        model = Model(inputs=[X_in, A_in, RL_indice], outputs=[qout])
        # summary() prints the table itself and returns None, so it is not wrapped in print().
        model.summary()
        return model


rl_model = GraphicQNetworkKeras(N, F, obs_space, act_space)

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
X_in (InputLayer)               [(None, 40, 9)]      0                                            
__________________________________________________________________________________________________
A_in (InputLayer)               [(None, 40, 40)]     0                                            
__________________________________________________________________________________________________
gcn1 (GraphConv)                (None, 40, 32)       320         X_in[0][0]                       
                                                                 A_in[0][0]                       
__________________________________________________________________________________________________
gcn2 (GraphConv)                (None, 40, 32)       1056        gcn1[0][0]                 

In [65]:
predicted = rl_model.base_model.predict(processed)

In [66]:
predicted.shape

(32, 40, 3)

In [67]:
processed[2][0].dtype

dtype('float64')

In [68]:
q_vals = predicted[0]
q_vals

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]], dtype=float32)

In [85]:
np.sum(np.any(q_vals, axis=1))

8

In [80]:
# processed[2] is the batch fed to the model's 'mask' input; look at the first sample.
mask = processed[2][0]

# Number of entries switched on in the mask, then one random action in {0, 1, 2} for each.
num_agent = mask.sum().astype(int)
np.random.choice(np.arange(3), size=num_agent)

array([2, 1, 2, 0, 2, 0, 2, 2])

In [98]:
from rl.policy import Policy
class greedy_q_policy(Policy):
    """Pick the highest-scoring action for every row that has any non-zero score."""

    def select_action(self, q_vals):
        """Return the argmax action per active row, or None if no row is active.

        A row of ``q_vals`` counts as active when at least one of its entries
        is non-zero; all-zero rows are ignored.
        """
        active_rows = np.any(q_vals, axis=1)
        if active_rows.sum() <= 0:
            return None
        return q_vals[active_rows, :].argmax(1)

class random_obs_policy(Policy):
    """Sample a uniformly random action for every entry switched on in the mask."""

    def __init__(self, nb_actions=3):
        """
        Args:
            nb_actions: number of discrete actions to sample from; actions are
                drawn from ``0 .. nb_actions - 1``. Defaults to 3, the value
                that used to be hard-coded.
        """
        super(random_obs_policy, self).__init__()
        self.nb_actions = nb_actions

    def select_action(self, observation):
        """Return one random action per active agent, or None if there is none.

        Args:
            observation: a 3-tuple whose last element is the mask; the number
                of actions drawn equals the (integer-truncated) sum of the mask.
        """
        _, _, mask = observation
        # np.sum also accepts plain sequences, unlike the ndarray-only mask.sum().
        num_agent = int(np.sum(mask))
        if num_agent <= 0:
            return None
        return np.random.choice(np.arange(self.nb_actions), num_agent)

class eps_greedy_q_policy(Policy):
    """Epsilon-greedy action selection over the active rows of a score matrix.

    With probability ``eps`` *all* active rows receive a random action;
    otherwise all of them take their argmax action. (A single random draw
    decides for the whole step, not one draw per row.)
    """

    def __init__(self, eps=.1):
        """
        Args:
            eps: probability of taking random actions instead of greedy ones.
        """
        super(eps_greedy_q_policy, self).__init__()
        self.eps = eps

    def select_action(self, q_vals):
        """Return one action per active row, or None if no row is active.

        Args:
            q_vals: 2-D array-like of shape (rows, actions). A row counts as
                active when at least one of its entries is non-zero.
        """
        q_vals = np.asarray(q_vals)
        mask = np.any(q_vals, axis=1)
        num_agent = int(mask.sum())
        if num_agent == 0:
            return None
        if np.random.uniform() < self.eps:  # choose random action
            # Action count comes from the score matrix instead of a hard-coded 3.
            return np.random.choice(np.arange(q_vals.shape[1]), num_agent)
        return q_vals[mask, :].argmax(1)

In [99]:
policy = eps_greedy_q_policy(0.3)

In [100]:
policy.select_action(q_vals)

array([1, 0, 0, 0, 2, 0, 0, 2])

In [91]:
q_vals

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]], dtype=float32)

In [52]:
%load_ext autoreload
%autoreload 2

In [53]:
from agents.dqn import DQNAgent