In [1]:
import numpy as np
import tensorflow as tf
import random

2023-10-11 23:48:36.518499: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# 1. Predict joint actions from central network given local states + global state [optional] -> Action_A, Action_B (action for agent A)
# 2. Agents perform the actions selected by the central network
# 3. Agents update their internal network by plugging in the original local state, plus the action the central network selected, to optimize its own prediction

# Central network: take in global state, output actions for all agents
# Local network: take in local state, output action for one agent

# Centralized Training:
# 	take in global state
# 	output q-values for actions for all agents
# 	agents take the action and go from state S->S'
# 	agents calculate the reward from state S'
# 	agents compute q-value for their individual actions taken 
# 	train model with updated q values 
# 	train local model with local state S with the same q-values as targets

# Q(s,a) = Q(s,a) + alpha[Reward(s,a) + gamma*maxQ(s', a') - Q(s,a)]

In [3]:
class Action():
    push = 0
    move = 1
    right = [0, 1]
    left = [0, -1]
    up = [-1, 0]
    down = [1, 0]
    no_op = [0, 0]
    action_list = [(push, left), (push, right), (push, up), (push, down), (push, no_op), (move, left), (move, right), (move, up), (move, down), (move, no_op)]
    
    def __init__(self, type, action):
        self.action_type = type
        self.action = action

    def __repr__(self):
        type = "Push" if self.action_type == self.push else "Move"
        return f"Action(type={type}, {self.action})"

    @classmethod
    def random(cls):
        action_type = random.choice([Action.push, Action.move])
        action = random.choice([Action.left, Action.right, Action.up, Action.down])
        action_type, action = random.choice(Action.action_list)
        return cls(action_type, action)

In [109]:
class Grid():
    def __init__(self):
        self.rows = 4
        self.cols = 4

        self.num_agents = 5
        
        self.goal = tuple(np.random.randint((self.rows, self.cols)))
        self.block = tuple(np.random.randint((self.rows, self.cols)))
        
        self.agent_list = []
        self.agent_positions = []
        
        self.action_shape = 10      

        self.epsilon = 0.1
        self.gamma = 0.1
        self.alpha = 0.1
        
        self.central_model = self._centralized_network()
        
    def add_agents(self):
        for i in range(self.num_agents):
            agent = Agent()
            agent.id = i
            agent.position = self.spawn_agent()
            agent.grid = grid
            self.agent_positions.append(agent.position)
            self.agent_list.append(agent)

    def spawn_agent(self):
        while True:
            position = tuple(np.random.randint((self.rows, self.cols)))
            if position not in self.agent_positions and position != self.goal and position != self.block:
                return position

    @property
    def local_states_shape(self):
        return (self.num_agents, self.rows, self.cols)

    def global_state(self):
        state = np.zeros((self.rows, self.cols))
        for agent in self.agent_list:
            state[agent.position[0]][agent.position[1]] = 1
        state[self.goal[0]][self.goal[1]] = -1
        state[self.block[0]][self.block[1]]= -2
         
        return state

    def local_states(self):
        local_states = []
        
        for agent in self.agent_list:
            local_states.append(agent.local_state())
        
        return np.array(local_states)
                    
    def _centralized_network(self):  
        y = x = tf.keras.layers.Input(self.local_states_shape) #Input: all local states of agents
        y = tf.keras.layers.Flatten()(y)
        y = tf.keras.layers.Dense(64)(y)
        y = tf.keras.layers.Dense(self.action_shape * self.num_agents)(y) 
        y = tf.keras.layers.Reshape((self.num_agents, self.action_shape))(y) #Output: actions for each agent
        
        model = tf.keras.Model(x, y)

        return model

    def calculate_reward(self, state, action):
        if self.block[0] == self.goal[0] and self.block[1] == self.goal[1]:
            return 1
        return 0
    
    def calculate_q_values(self, pred_q_values, rewards, future_q_values):
        q_values = []
        # Q(S,A) + lr[R(s,a) + gamma(MaxQ'(s',A')) - Q(S,A)]
        for i, agent in enumerate(self.agent_list):
            q_values.append(pred_q_values[i] + self.alpha * (rewards[i] + self.gamma * future_q_values[i] - pred_q_values[i]))
        return q_values
    
    def central_policy(self):
        calculated_q_values = []
        rewards = []
        previous_local_states = self.local_states()
        
        predicted_q_values = self.central_model(np.array([previous_local_states]))[0].numpy()
        
        max_q_values_indices = np.argmax(predicted_q_values, axis=1)
        max_q_values = predicted_q_values[np.arange(self.num_agents), max_q_values_indices]

        actions_list = [Action(*Action.action_list[x]) for x in max_q_values_indices]

        for i, agent in enumerate(self.agent_list):
            agent.previous_state = previous_local_states[i]

        for i, agent in enumerate(self.agent_list):
            if random.random() < self.epsilon:
                action = Action.random()
            else:
                action = actions_list[i]

            agent.take_action(action)
            agent.current_state = agent.local_state()

            rewards.append(self.calculate_reward(action, agent.current_state))
            
        current_local_states = self.local_states()
        
        future_q_values = self.central_model(np.array([current_local_states]))[0].numpy()
        future_max_q_values_indices = np.argmax(predicted_q_values, axis=1)
        future_max_q_values = predicted_q_values[np.arange(self.num_agents), max_q_values_indices]

        calculated_q_values = self.calculate_q_values(max_q_values, rewards, future_max_q_values)

        # self.central_model.fit(local_states, correct_q_values) #Train model with the correct q-values

        #     #Fit local model
        #     agent.local_model.fit(agent.local_state(), q_value)

In [110]:
class Agent():
    def __init__(self):
        self.id = 0
        self.position = 0
        self.epsilon = 0.1
        
        self.grid = []
        
        self.action_shape = 10      

        self.local_model = self._local_network() 
        
        self.current_state = []
        self.previous_state = []
        
    def __repr__(self):
        return f"Agent(id={self.id}, pos={self.position})"

    def take_action(self, action):        
        if action.action_type == 0:
            grid.block += np.array(action.action)
            grid.block %= [grid.rows, grid.cols]
        
        if action.action_type == 1:
            self.position += np.array(action.action)
            self.position %= [grid.rows, grid.cols]
                        
    def local_state(self):
        local = np.copy(grid.global_state())
        local[self.position] = 2
        return local

    def _local_network(self):        
        local_state = self.local_state()
        y = x = tf.keras.layers.Input(np.shape(local_state)) #Input: local state of agent
        y = tf.keras.layers.Dense(64)(y)
        y = tf.keras.layers.Dense(self.action_shape)(y) #Output: action for agent
        model = tf.keras.Model(x, y)
        
        return model
    
    def local_policy(self):        
        #Get the current state 
        current_state = self.get_state()
        
        #Take random action with some probability
        if random.random() < self.epsilon:
            return Action.random()
        #Otherwise pick the best action to take
        else:
            predicted_action_q_values = self.model.predict(current_state)
            best_action_q_value = np.max(predicted_action_q_values)
            action = Actions[best_action_q_value]

In [111]:
grid = Grid()

In [112]:
grid.global_state()

array([[ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0., -2.],
       [ 0.,  0., -1.,  0.],
       [ 0.,  0.,  0.,  0.]])

In [113]:
grid.add_agents()

In [114]:
grid.agent_list

[Agent(id=0, pos=(3, 3)),
 Agent(id=1, pos=(2, 3)),
 Agent(id=2, pos=(3, 0)),
 Agent(id=3, pos=(2, 0)),
 Agent(id=4, pos=(1, 1))]

In [115]:
grid.local_states()

array([[[ 0.,  0.,  0.,  0.],
        [ 0.,  1.,  0., -2.],
        [ 1.,  0., -1.,  1.],
        [ 1.,  0.,  0.,  2.]],

       [[ 0.,  0.,  0.,  0.],
        [ 0.,  1.,  0., -2.],
        [ 1.,  0., -1.,  2.],
        [ 1.,  0.,  0.,  1.]],

       [[ 0.,  0.,  0.,  0.],
        [ 0.,  1.,  0., -2.],
        [ 1.,  0., -1.,  1.],
        [ 2.,  0.,  0.,  1.]],

       [[ 0.,  0.,  0.,  0.],
        [ 0.,  1.,  0., -2.],
        [ 2.,  0., -1.,  1.],
        [ 1.,  0.,  0.,  1.]],

       [[ 0.,  0.,  0.,  0.],
        [ 0.,  2.,  0., -2.],
        [ 1.,  0., -1.,  1.],
        [ 1.,  0.,  0.,  1.]]])

In [116]:
grid.block

(1, 3)

In [117]:
grid.central_policy()

In [118]:
grid.agent_list

[Agent(id=0, pos=(3, 3)),
 Agent(id=1, pos=(2, 3)),
 Agent(id=2, pos=(3, 0)),
 Agent(id=3, pos=[2 3]),
 Agent(id=4, pos=(1, 1))]