In [1]:
import numpy as np
import tensorflow as tf
import random

2023-10-06 18:53:44.895626: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# 1. Predict joint actions from central network given local states + global state [optional] -> Action_A, Action_B (action for agent A)
# 2. Agents perform the actions selected by the central network
# 3. Agents update their internal network by plugging in the original local state, plus the action the central network selected, to optimize its own prediction

# Central network: take in global state, output actions for all agents
# Local network: take in local state, output action for one agent

# Centralized Training:
# 	take in global state
# 	output q-values for actions for all agents
# 	agents take the action and go from state S->S'
# 	agents calculate the reward from state S'
# 	agents compute q-value for their individual actions taken 
# 	train model with updated q values 
# 	train local model with local state S with the same q-values as targets

# Q(s,a) = Q(s,a) + alpha[Reward(s,a) + gamma*maxQ(s', a') - Q(s,a)]

In [3]:
class Action():
    push = 0
    move = 1
    right = [0, 1]
    left = [0, -1]
    up = [-1, 0]
    down = [1, 0]
    no_op = [0, 0]
    action_list = [(push, left), (push, right), (push, up), (push, down), (push, no_op), (move, left), (move, right), (move, up), (move, down), (move, no_op)]
    
    def __init__(self, type, action):
        self.action_type = type
        self.action = action

    @classmethod
    def random(cls):
        action_type = random.choice([Action.push, Action.move])
        action = random.choice([Action.left, Action.right, Action.up, Action.down])
        action_type, action = random.choice(Action.action_list)
        return cls(action_type, action)

In [4]:
class Grid():
    def __init__(self):
        self.rows = 4
        self.cols = 4
        
        self.goal = tuple(np.random.randint((self.rows, self.cols)))
        self.block = tuple(np.random.randint((self.rows, self.cols)))
        
        self.agent_list = []
        self.agent_positions = []
        
        self.action_shape = 10      

        self.epsilon = 0.1
        
        self.central_model = self._centralized_network()
        
    def add_agent(self, agent, grid):
        agent.id = self.num_agents
        agent.position = self.spawn_agent()
        agent.grid = grid
        self.agent_positions.append(agent.position)
        self.agent_list.append(agent)

    def spawn_agent(self):
        while True:
            position = tuple(np.random.randint((self.rows, self.cols)))
            if position not in self.agent_positions and position != self.goal and position != self.block:
                return position
                
    @property
    def num_agents(self):
        return len(self.agent_list)

    @property
    def local_states_shape(self):
        return np.shape(self.local_states())

    def global_state(self):
        state = np.zeros((self.rows, self.cols))
        for agent in self.agent_list:
            state[agent.position[0]][agent.position[1]] = 1
        state[self.goal[0]][self.goal[1]] = -1
        state[self.block[0]][self.block[1]]= -2
         
        return state

    def local_states(self):
        local_states = []
        if len(self.agent_list) > 0:
            for agent in self.agent_list:
                # local_states = np.concatenate((agent.local_state(), local_states)) #Ask about this
                local_states.append(agent.local_state())
            return local_states
        return []
        
    def _centralized_network(self):                
        y = x = tf.keras.layers.Input(np.shape(self.local_states_shape)) #Input: all local states of agents
        y = tf.keras.layers.Dense(64)(y)
        y = tf.keras.layers.Dense(self.action_shape *self.num_agents)(y) #Output: actions for each agent
        model = tf.keras.Model(x, y)

        return model

    def calculate_reward(self, action, current_state):
        return 0

    def calculate_q_value(self, action, current_state, previous_state):
        return 0

    def central_policy(self):
        calculated_q_values = []
        local_states = self.local_states()
        predicted_q_values = self.central_model.predict(local_states)
        max_q_values_indices = np.argmax(q_values, axis=1)
        max_q_values = predicted_q_values[np.arange(len(self.num_agents)), max_q_values_indices]
        actions_list = [Action.action_list[x] for x in max_q_values_indices]
        
        for agent in self.agent_list:
            #Convert max q-value to action
            agent.previous_state = agent.current_state
            
            #Take random action with some probability
            if random.random() < self.epsilon:
                action = Action.random()

            #Convert q-value index to action
            
            
            agent.take_action()
            
            agent.current_state = agent.local_state()
            reward = self.calculate_reward(action, agent.current_state)
            calculated_q_values.append(self.calculate_q_value())

            #Fit local model
            agent.local_model.fit(agent.local_state(), q_value)
        #Replace predicted max q-value with correct one
        self.central_model.fit(local_states, correct_q_values) #Train model with the correct q-values

In [5]:
class Agent():
    def __init__(self):
        self.id = 0
        self.position = 0
        self.epsilon = 0.1
        
        self.grid = []
        
        self.action_shape = 10      

        self.local_model = self._local_network() 
        
        self.current_state = []
        self.previous_state = []
        
    def __repr__(self):
        return f"Agent(id={self.id}, pos={self.position})"

    def take_action(self, action, grid):
        if action.action_type == 0:
            grid.block += np.array(action.action)
            grid.block %= [grid.rows, grid.cols]
        
        if action.action_type == 1:
            self.position += np.array(action.action)
            self.position %= [grid.rows, grid.cols]
            
    def local_state(self):
        local = np.copy(grid.global_state())
        local[self.position] = 2
        return local

    def _local_network(self):        
        local_state = self.local_state()
        y = x = tf.keras.layers.Input(np.shape(local_state)) #Input: local state of agent
        y = tf.keras.layers.Dense(64)(y)
        y = tf.keras.layers.Dense(self.action_shape)(y) #Output: action for agent
        model = tf.keras.Model(x, y)
        
        return model
    
    def local_policy(self):        
        #Get the current state 
        current_state = self.get_state()
        
        #Take random action with some probability
        if random.random() < self.epsilon:
            return Action.random()
        #Otherwise pick the best action to take
        else:
            predicted_action_q_values = self.model.predict(current_state)
            best_action_q_value = np.max(predicted_action_q_values)
            action = Actions[best_action_q_value]

In [6]:
grid = Grid()

In [7]:
for i in range(5):
    agent = Agent()
    grid.add_agent(agent, grid)

In [8]:
grid.global_state()

array([[ 1.,  1., -2., -1.],
       [ 1.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  1.,  0.,  0.]])

In [9]:
grid.central_policy()

ValueError: in user code:

    File "/opt/conda/lib/python3.11/site-packages/keras/engine/training.py", line 2169, in predict_function  *
        return step_function(self, iterator)
    File "/opt/conda/lib/python3.11/site-packages/keras/engine/training.py", line 2155, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/conda/lib/python3.11/site-packages/keras/engine/training.py", line 2143, in run_step  **
        outputs = model.predict_step(data)
    File "/opt/conda/lib/python3.11/site-packages/keras/engine/training.py", line 2111, in predict_step
        return self(x, training=False)
    File "/opt/conda/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/opt/conda/lib/python3.11/site-packages/keras/engine/input_spec.py", line 219, in assert_input_compatibility
        raise ValueError(

    ValueError: Layer "model" expects 1 input(s), but it received 5 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 4) dtype=float32>, <tf.Tensor 'IteratorGetNext:1' shape=(None, 4) dtype=float32>, <tf.Tensor 'IteratorGetNext:2' shape=(None, 4) dtype=float32>, <tf.Tensor 'IteratorGetNext:3' shape=(None, 4) dtype=float32>, <tf.Tensor 'IteratorGetNext:4' shape=(None, 4) dtype=float32>]
