In [4]:
import numpy as np
import tensorflow as tf
import random
from IPython.display import clear_output

2023-10-20 09:12:32.825788: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# 1. Predict joint actions from the central network given the local states (+ optionally the global state) -> Action_A, Action_B (Action_A is the action for agent A, and so on)
# 2. Agents perform the actions selected by the central network
# 3. Agents update their internal network by plugging in the original local state, plus the action the central network selected, to optimize its own prediction

# Central network: take in global state, output actions for all agents
# Local network: take in local state, output action for one agent

# Centralized Training:
# 	take in global state
# 	output q-values for actions for all agents
# 	agents take the action and go from state S->S'
# 	agents calculate the reward from state S'
# 	agents compute q-value for their individual actions taken 
# 	train model with updated q values 
# 	train local model with local state S with the same q-values as targets

# Q-learning update: Q(s, a) <- Q(s, a) + alpha * [Reward(s, a) + gamma * max_a' Q(s', a') - Q(s, a)]

In [6]:
class Action():
    """One agent action: an action type (push or move) plus a grid direction.

    Class attributes:
        push, move: Integer codes for the two action types.
        right, left, up, down, no_op: ``[row_delta, col_delta]`` offsets.
        action_list: Every ``(type, direction)`` pair. The position of a pair
            in this list is the index the Q-networks use for that action.
    """
    push = 0
    move = 1
    right = [0, 1]
    left = [0, -1]
    up = [-1, 0]
    down = [1, 0]
    no_op = [0, 0]
    action_list = [(push, left), (push, right), (push, up), (push, down), (push, no_op), (move, left), (move, right), (move, up), (move, down), (move, no_op)]

    def __init__(self, type, action):
        """Store the action type code and the ``[row_delta, col_delta]`` offset."""
        self.action_type = type
        self.action = action

    def __repr__(self):
        # Use a dedicated local name so the builtin ``type`` is not shadowed.
        label = "Push" if self.action_type == self.push else "Move"
        return f"Action(type={label}, {self.action})"

    @classmethod
    def random(cls):
        """Return an action drawn uniformly from ``action_list``.

        Exactly one draw is made from the ``random`` module, so the sampled
        action (no-op included) is always an entry of ``action_list``.
        """
        action_type, action = random.choice(cls.action_list)
        return cls(action_type, action)

In [7]:
class Agent():
    """A grid-world agent that can move itself or push the shared block.

    The agent keeps a reference to the grid it acts in (``self.grid``) and owns
    a small Q-network (``local_model``) with one output per action index.
    """

    def __init__(self, env=None):
        """Create the agent and build its local Q-network.

        Args:
            env: The grid this agent acts in. When omitted, the notebook-level
                ``grid`` variable is used, so the original ``Agent()`` call
                keeps working.
        """
        self.id = 0
        self.position = 0

        self.epsilon = 0.1  # probability of exploring in local_policy
        self.gamma = 0.1    # discount applied to the future Q-value
        self.alpha = 0.1    # step size of the Q-value update

        # Bind the grid before building the network: the network's input shape
        # is read from it.
        self.grid = env if env is not None else grid

        self.action_shape = 10  # one output per entry of Action.action_list

        self.local_model = self._local_network()

        self.previous_state = []

    def __repr__(self):
        return f"Agent(id={self.id}, pos={self.position})"

    @staticmethod
    def _shifted(cell, step, bounds):
        """Return ``cell + step`` wrapped into ``bounds`` as a tuple of ints."""
        return tuple(int(v) for v in (np.array(cell) + step) % bounds)

    def take_action(self, action):
        """Apply ``action`` to the world, wrapping around the grid edges.

        A push (``action_type == 0``) displaces the grid's block; a move
        (``action_type == 1``) displaces this agent. The new coordinates are
        stored as fresh tuples instead of being added in place, so they remain
        valid single-cell indices and never alias a previously saved position.
        """
        step = np.array(action.action)
        bounds = [self.grid.rows, self.grid.cols]

        if action.action_type == 0:
            self.grid.block = self._shifted(self.grid.block, step, bounds)

        if action.action_type == 1:
            self.position = self._shifted(self.position, step, bounds)

    def local_state(self):
        """Return the flattened grid with a 1 at this agent's position."""
        local = np.copy(self.grid.state_template)
        # Index with a tuple: an ndarray/list index would select whole rows.
        local[tuple(np.atleast_1d(self.position))] = 1
        return local.flatten()

    def _local_network(self):
        """Build and compile the agent's Q-network (one output per action)."""
        y = x = tf.keras.layers.Input(self.grid.global_state_shape) #Input: global state
        y = tf.keras.layers.Flatten()(y)
        y = tf.keras.layers.Dense(64, activation="tanh")(y)
        y = tf.keras.layers.Dense(self.action_shape)(y) #Output: action for agent
        model = tf.keras.Model(x, y)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), loss=tf.keras.losses.Huber())

        return model

    def calculate_reward(self, grid=None):
        """Return 0 when the block sits on the goal, otherwise -1.

        Args:
            grid: Grid to inspect; defaults to the agent's own grid.
        """
        if grid is None:
            grid = self.grid
        at_goal = grid.block[0] == grid.goal[0] and grid.block[1] == grid.goal[1]
        return 0 if at_goal else -1

    def calculate_q_value(self, pred_q_value, reward, future_q_value):
        """Return the Q-learning update of ``pred_q_value``.

        Computes ``Q + alpha * (reward + gamma * future_q_value - Q)``.
        """
        td_error = reward + self.gamma * future_q_value - pred_q_value
        return pred_q_value + self.alpha * td_error

    def local_policy(self):
        """Run one epsilon-greedy step with the local model.

        With probability ``epsilon`` a random ``Action`` is returned without
        being executed. Otherwise the greedy action is executed and the
        Q-value of that action is recomputed from the observed reward.

        NOTE(review): the local network is built on the grid's
        ``global_state_shape`` and ``Grid.train_local`` fits it on the global
        state, so the global state is fed here; the earlier code passed
        ``self.local_state()``, whose shape does not match the model input.
        NOTE(review): the recomputed Q-values are neither returned nor used to
        fit the model yet, and the exploring branch does not execute its
        action -- confirm the intended behaviour before relying on this.
        """
        previous_state = self.grid.global_state()

        predicted_q_values = self.local_model(np.array([previous_state]))[0].numpy()
        updated_q_values = np.copy(predicted_q_values)

        if random.random() < self.epsilon:
            return Action.random()

        # predicted_q_values is 1-D (one entry per action), so no axis is given.
        max_q_value_index = int(np.argmax(predicted_q_values))
        max_q_value = predicted_q_values[max_q_value_index]
        action = Action(*Action.action_list[max_q_value_index])

        self.take_action(action)

        reward = self.calculate_reward(self.grid)

        current_state = self.grid.global_state()
        future_q_values = self.local_model(np.array([current_state]))[0].numpy()
        future_max_q_value = future_q_values[int(np.argmax(future_q_values))]

        calculated_q_value = self.calculate_q_value(max_q_value, reward, future_max_q_value)
        updated_q_values[max_q_value_index] = calculated_q_value

In [8]:
class Grid():
    """Grid world holding the goal, the pushable block, the agents and the central Q-network.

    The central network maps the global state (one flattened plane per agent,
    plus one for the goal and one for the block) to a row of Q-values per
    agent, indexed like ``Action.action_list``.
    """

    def __init__(self):
        self.rows = 4
        self.cols = 4
        self.state_template = np.zeros((self.rows, self.cols))

        self.num_agents = 1

        self.goal = self._random_cell()
        self.block = self._random_cell()

        self.agent_list = []
        self.agent_positions = []

        self.action_shape = 10  # one output per entry of Action.action_list

        self.epsilon = 0.1  # exploration probability in central_policy
        self.gamma = 0.1    # discount applied to the future Q-value

        self.central_model = self._centralized_network()

    def _random_cell(self):
        """Return a uniformly random ``(row, col)`` tuple inside the grid."""
        return tuple(np.random.randint((self.rows, self.cols)))

    def _one_hot_cell(self, cell):
        """Return the flattened grid with a single 1 at ``cell``."""
        plane = np.copy(self.state_template)
        # tuple(...) selects exactly one cell whether ``cell`` is a tuple or an
        # ndarray, and avoids the star-in-subscript syntax (Python 3.11+ only).
        plane[tuple(cell)] = 1
        return plane.flatten()

    def add_agents(self):
        """Create agents until ``num_agents`` exist and place each on a free cell.

        Calling this again once the agents exist is a no-op, so re-running the
        notebook cell cannot push the agent count past what the networks were
        sized for.
        """
        for i in range(len(self.agent_list), self.num_agents):
            agent = Agent()
            agent.id = i
            agent.position = self.spawn_agent()
            agent.grid = self  # bind to this grid rather than a notebook global
            self.agent_positions.append(agent.position)
            self.agent_list.append(agent)

    def spawn_agent(self):
        """Sample cells until one is free of agents, the goal and the block."""
        occupied = {tuple(self.goal), tuple(self.block)}
        while True:
            position = self._random_cell()
            if position not in self.agent_positions and position not in occupied:
                return position

    @property
    def global_state_shape(self):
        """Shape of ``global_state()``: one plane per agent plus goal and block."""
        return (self.num_agents+2, self.rows*self.cols)

    def view_global_state(self):
        """Return a 2-D picture of the world: agents 1, goal -1, block -2."""
        state = np.zeros((self.rows, self.cols))
        for agent in self.agent_list:
            state[agent.position[0]][agent.position[1]] = 1
        # Goal and block are drawn after (and on top of) the agents, and are
        # drawn even when no agent has been added yet.
        state[self.goal[0]][self.goal[1]] = -1
        state[self.block[0]][self.block[1]] = -2

        return state

    def set_goal_state(self):
        """Return the flattened one-hot plane marking the goal."""
        return self._one_hot_cell(self.goal)

    def set_block_state(self):
        """Return the flattened one-hot plane marking the block."""
        return self._one_hot_cell(self.block)

    def global_state(self):
        """Stack every agent plane, then the goal plane, then the block plane."""
        return np.stack((*self.agent_states(), self.set_goal_state(), self.set_block_state()))

    def agent_states(self):
        """Return the ``local_state()`` of every agent as one array."""
        return np.array([agent.local_state() for agent in self.agent_list])

    def train_local(self, q_values):
        """Fit each agent's local model on its row of ``q_values``."""
        for i, agent in enumerate(self.agent_list):
            agent.local_model.fit(np.array([agent.previous_state]), np.array([q_values[i,:]]), verbose=0)

    def _centralized_network(self):
        """Build and compile the central Q-network."""
        y = x = tf.keras.layers.Input(self.global_state_shape) #Input: all local states of agents
        y = tf.keras.layers.Flatten()(y)
        y = tf.keras.layers.Dense(64, activation="tanh")(y)
        y = tf.keras.layers.Dense(self.action_shape * self.num_agents)(y)
        y = tf.keras.layers.Reshape((self.num_agents, self.action_shape))(y) #Output: actions for each agent

        model = tf.keras.Model(x, y)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), loss=tf.keras.losses.Huber(delta=1.0))

        return model

    def calculate_rewards(self, previous_block, current_block):
        """Return one reward per agent for moving the block.

        The reward is ``0`` when the block is on the goal and ``-1`` otherwise
        (the same convention as ``Agent.calculate_reward``), plus the reduction
        in Manhattan distance between block and goal. Actions are chosen by
        argmax over Q-values, so progress towards the goal must raise the
        reward; the earlier sign rewarded moving away from the goal.

        NOTE(review): the distance ignores wrap-around at the grid edges.
        """
        goal = np.array(self.goal)
        previous_diff = np.sum(np.abs(goal - np.array(previous_block)))
        current_diff = np.sum(np.abs(goal - np.array(current_block)))

        progress = previous_diff - current_diff
        at_goal = tuple(current_block) == tuple(self.goal)
        step_reward = 0 if at_goal else -1

        return [step_reward + progress for _ in self.agent_list]

    def calculate_q_values(self, pred_q_values, rewards, future_q_values):
        """Return the target ``reward + gamma * future_q`` for every agent.

        ``pred_q_values`` is accepted for interface compatibility but is not
        part of the target.
        """
        return [rewards[i] + self.gamma * future_q_values[i] for i in range(len(self.agent_list))]

    #change to value function learning
    def central_policy(self):
        """Run one epsilon-greedy step for all agents and train the networks.

        The Q-value that gets updated is the one of the action each agent
        actually executed (greedy or exploratory).
        """
        previous_global_state = self.global_state()

        predicted_q_values = self.central_model(np.array([previous_global_state]))[0].numpy()
        updated_q_values = np.copy(predicted_q_values)

        agent_rows = np.arange(self.num_agents)
        greedy_indices = np.argmax(predicted_q_values, axis=1)
        taken_indices = np.copy(greedy_indices)

        for agent in self.agent_list:
            agent.previous_state = previous_global_state

        # Snapshot as a tuple: the block may be an ndarray that take_action
        # mutates in place, which would make "previous" follow "current".
        previous_block = tuple(self.block)

        #random permutation each time
        for i, agent in enumerate(self.agent_list):
            if random.random() < self.epsilon:
                action = Action.random()
                taken_indices[i] = Action.action_list.index((action.action_type, action.action))
            else:
                action = Action(*Action.action_list[greedy_indices[i]])

            #add tiebreaker
            agent.take_action(action)

        current_block = tuple(self.block)

        #reward for next state
        rewards = self.calculate_rewards(previous_block, current_block)

        current_global_state = self.global_state()

        future_q_values = self.central_model(np.array([current_global_state]))[0].numpy()
        future_max_q_values = np.max(future_q_values, axis=1)

        taken_q_values = predicted_q_values[agent_rows, taken_indices]
        calculated_q_values = self.calculate_q_values(taken_q_values, rewards, future_max_q_values)
        updated_q_values[agent_rows, taken_indices] = calculated_q_values

        self.central_model.fit(np.array([previous_global_state]), np.array([updated_q_values]), verbose=0) #Train model with the correct q-values

        # Fit local models
        self.train_local(updated_q_values)

        clear_output(wait=True)
        print(self.view_global_state())

    def reset(self):
        """Re-sample the goal, the block and every agent's position."""
        self.goal = self._random_cell()
        self.block = self._random_cell()

        # Rebuild the occupancy list so agents are checked against each other's
        # new positions, not the ones from the previous episode.
        self.agent_positions = []
        for agent in self.agent_list:
            agent.position = self.spawn_agent()
            self.agent_positions.append(agent.position)

    def episode(self, move_limit=100):
        """Play until the block reaches the goal or ``move_limit`` steps pass."""
        self.reset()
        moves = 0
        while tuple(self.block) != self.goal and moves < move_limit:
            self.central_policy()
            moves += 1
            if tuple(self.block) == self.goal:
                print("Goal Reached!")

    def train(self, num_episodes=1, move_limit=50):
        """Run ``num_episodes`` episodes, each capped at ``move_limit`` moves."""
        for i in range(num_episodes):
            print("Episode", i)
            self.episode(move_limit)

In [12]:
# Seed every RNG the simulation draws from (Python's `random` for action
# sampling, NumPy for goal/block/agent placement, TensorFlow for weight
# initialisation) so that Restart & Run All reproduces the same run.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

grid = Grid()

In [13]:
# Agents are created only after `grid` exists: building an Agent relies on the
# notebook-level `grid` to size the input layer of its local network.
grid.add_agents()

In [None]:
# Effectively "train until the kernel is interrupted": 10**12 episodes, each
# capped at train()'s default move limit.
grid.train(num_episodes=10**12)

[[ 0.  0.  0. -1.]
 [-2.  1.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]
