In [19]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import numpy as np
import torch as torch
import torch.nn as nn
import torch.nn.functional as F
import random
from IPython.display import clear_output
from enum import Enum, auto
import yaml

In [30]:
# get config of qmix algorithim 
with open('qmix.yaml', 'r') as f:
    qmix_config = yaml.load(f, Loader=yaml.FullLoader)
# get config of environment
with open('env.yaml', 'r') as f:
    env_config = yaml.load(f, Loader=yaml.FullLoader)

In [32]:
qmix_config

{'action_selector': 'epsilon_greedy',
 'epsilon_start': 1.0,
 'epsilon_finish': 0.05,
 'epsilon_anneal_time': 50000,
 'runner': 'episode',
 'buffer_size': 5000,
 'target_update_interval': 200,
 'agent_output_type': 'q',
 'learner': 'q_learner',
 'double_q': True,
 'mixer': 'qmix',
 'mixing_embed_dim': 32,
 'hypernet_layers': 2,
 'hypernet_embed': 64,
 'name': 'qmix'}

In [31]:
env_config

{'rows': 6, 'cols': 6, 'num_agents': 4, 'name': 'block_problem'}

In [2]:
class Actions(Enum):
    NO_OP = 0
    MOVE_UP = auto()
    MOVE_DOWN = auto()
    MOVE_LEFT = auto()
    MOVE_RIGHT = auto()
    
    @property
    def delta(self):
        if self == self.NO_OP:
            return (0, 0)
        if self == self.MOVE_UP:
            return (-1, 0)
        if self == self.MOVE_DOWN:
            return (1, 0)
        if self == self.MOVE_LEFT:
            return (0, -1)
        if self == self.MOVE_RIGHT:
            return (0, 1)

    @property 
    def shape(self):
        return len(Actions)

    @property 
    def one_hot(self):
        return tf.one_hot(self.value, len(Actions))

In [3]:
class GridEnv():
    def __init__(self, rows, cols, num_agents):
        self.rows = rows
        self.cols = cols
        
        self.num_agents = num_agents
        self.agent_list = []
        
        self.goal = []
        self.block = []

    def qmix(self):
        agent_inputs = [agent.local_model.inputs for agent in self.agent_list]
        agent_outputs = [agent.local_model.output for agent in self.agent_list]
        global_state = tf.keras.layers.Input(batch_shape=(1, *np.shape(self.get_global_state())))

        q_values = tf.keras.layers.Concatenate()(agent_outputs)
        y = MixingLayer(grid)((q_values, global_state))
        qmix = tf.keras.Model((agent_inputs, global_state), (agent_outputs, y))
        qmix.compile(run_eagerly=True)
        
        return qmix

    def populate_grid(self):
        positions = np.random.choice(self.rows*self.cols, self.num_agents+2, replace=False)

        self.goal = (positions[0]//self.cols, positions[0]%self.cols)
        self.block = (positions[1]//self.cols, positions[1]%self.cols)

        for i in range(self.num_agents):
            agent = Agent(self, i)
            agent.position = (positions[i+2]//self.cols, positions[i+2]%self.cols)
            self.agent_list.append(agent)

    def get_global_state(self):
        return tf.one_hot([p[0]*self.cols+p[1] for p in (self.goal, self.block, *[a.position for a in self.agent_list])], self.rows*self.cols)
    
    def vizualize_grid(self): 
        grid = [list("."*self.cols) for _ in range(self.rows)]

        grid[self.goal[0]][self.goal[1]] = "G"
        grid[self.block[0]][self.block[1]]= "B"

        for i, agent in enumerate(self.agent_list):
            grid[agent.position[0]][agent.position[1]] = str(i)

        return '\n'.join([' '.join(row) for row in grid])

    def __repr__(self):
        return str(self.vizualize_grid())

In [4]:
class Agent():
    def __init__(self, grid, id):
        self.grid = grid

        self.id = id
        self.position = [0, 0]
        self.epsilon = 0.1
        self.gamma = 0.1
        self.alpha = 0.1
        
        self.local_model = self._build_local_model() 

        self.history = ([], [])
        self.previous_action = Actions.NO_OP

    def get_local_state(self):
        return self.grid.get_global_state()

    def take_action(self, action):
        future_position = self.position
        future_position += np.array(action.delta)
        future_position %= [self.grid.rows, self.grid.cols]

        if list(future_position) not in [a.position for a in self.grid.agent_list]:
            if list(future_position) == self.grid.block:
                self.grid.block += np.array(action.delta)
                self.grid.block %= [self.grid.rows, self.grid.cols]
            else:
                self.position = future_position
            
    def get_reward(self):
        if self.grid.block == self.grid.goal:
            return 0
        else:
            return -1

    def get_correct_qvalue(self, pred_max_value, reward, current_max_value):
        q_value = pred_max_value + self.alpha * (reward + self.gamma * current_max_value - pred_max_value)
        return q_value
                
    def _build_local_model(self):
        local_state = tf.keras.layers.Input(batch_shape=(1, None, self.grid.num_agents+2, self.grid.rows * self.grid.cols), name=f"Local_State{self.id}") #Input: local state
        prev_action = tf.keras.layers.Input(batch_shape=(1, None, len(Actions)), name=f"Prev_Action{self.id}") #Input: previous action
        y = tf.keras.layers.Reshape((-1, (self.grid.num_agents+2) * self.grid.rows * self.grid.cols))(local_state)
        y = tf.keras.layers.Concatenate(axis=-1)([y, prev_action])
        y = tf.keras.layers.Dense(64, activation="relu")(y)
        y = tf.keras.layers.GRU(64, stateful=False)(y)
        y = tf.keras.layers.Dense(len(Actions))(y) #Output: q value of actions

        model = tf.keras.Model((local_state, prev_action), y) # outputs hidden state tensor->pass back in as input return_state=true
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), loss=tf.keras.losses.Huber())
        
        return model
    
    def step(self):
        current_state = self.get_local_state()

        self.history[0].append(current_state)
        self.history[1].append(self.previous_action.one_hot)
        
        pred_qvalues = self.local_model((np.array([self.history[0]]), np.array([self.history[1]])))[0].numpy() #placeholder action
        pred_max_index = np.argmax(pred_qvalues)

        if random.random() < self.epsilon:
           action = random.choice(list(Actions))
        else:
            action = Actions(pred_max_index)
            
        self.take_action(action)
        self.previous_action = action
        future_state = self.get_local_state()
        
        reward = self.get_reward()
        
        future_qvalues  = self.local_model((np.array([self.history[0]+[future_state]]), np.array([self.history[1]+[action.one_hot]])))[0].numpy()
        future_max_index = np.argmax(future_qvalues)

        target_qvalue = self.get_correct_qvalue(pred_qvalues[pred_max_index], reward, future_qvalues[future_max_index])
        target_qvalues = pred_qvalues.copy()
        target_qvalues[pred_max_index] = target_qvalue

        self.grid.q_values[self.id] = target_qvalue

        self.local_model.fit((np.array([self.history[0]]), np.array([self.history[1]])), np.array([target_qvalues]), verbose=0) 

In [5]:
# class MixingLayer(tf.keras.layers.Layer):
#     def __init__(self, grid, **kwargs):
#         super().__init__(**kwargs)
#         self.grid = grid
        
#         self.hyper_w1 = tf.keras.layers.Dense(grid.num_agents*64, use_bias=False) 
#         self.hyper_b1 = tf.keras.layers.Dense(64, use_bias=False)
        
#         self.hyper_w2 = tf.keras.layers.Dense(64, use_bias=False)
#         self.hyper_b2 = tf.keras.layers.Dense(64, use_bias=False)
#         self.hyper_b2_final = tf.keras.layers.Dense(1, use_bias=False)

#     def call(self, inputs):
#         # q_values = tf.transpose(tf.reshape(inputs[0], (1, self.grid.num_agents, -1)), (0, 2, 1))
#         q_values = tf.transpose(tf.reshape(inputs[0], (1, self.grid.num_agents, -1)), (0, 1, 2))

#         state_dim = inputs[0].shape[-1]
#         global_state = tf.reshape(inputs[1], (1, -1))

#         w1 = self.hyper_w1(global_state) # take in global state to produce 1st layer weights
#         w1 = tf.math.abs(w1) # take absolute value
#         w1 = tf.reshape(w1, (1, grid.num_agents, 64))
#         b1 = self.hyper_b1(global_state) # add the bias
#         print(w1.shape, b1.shape, q_values.shape)
#         output_w1 = tf.keras.activations.elu(tf.matmul(q_values, w1) + b1)
#         print("layer 1", output_w1.shape)
        
#         w2 = self.hyper_w2(global_state)
#         w2 = tf.math.abs(w2)
#         b2 = tf.keras.activations.relu(self.hyper_b2(global_state))
#         b2 = self.hyper_b2_final(b2)

#         print("layer1 reshape", output_w1.shape)
#         print("layer 2", w2.shape)
        
#         joint_q = tf.matmul(output_w1, w2, transpose_b=True) + b2
#         print("joint q", joint_q.shape)
#         joint_q = tf.reshape(joint_q, (1, -1))
#         print("joint q output", joint_q.shape)
#         return joint_q


In [6]:
class QMixer(nn.Module):
    def __init__(self, n_agents, state_shape, mixing_embed_dim, hyper_embed):
        super(QMixer, self).__init__()

        # self.args = args
        self.n_agents = n_agents
        self.state_dim = int(np.prod(state_shape))

        self.embed_dim = mixing_embed_dim

        # if getattr("hypernet_layers", 1) == 1:
        #     self.hyper_w_1 = nn.Linear(self.state_dim, self.embed_dim * self.n_agents)
        #     self.hyper_w_final = nn.Linear(self.state_dim, self.embed_dim)
        # elif getattr(args,"hypernet_layers", 1) == 2:
        hypernet_embed = hyper_embed
        self.hyper_w_1 = nn.Sequential(nn.Linear(self.state_dim, hypernet_embed),
                                       nn.ReLU(),
                                       nn.Linear(hypernet_embed, self.embed_dim * self.n_agents))
        self.hyper_w_final = nn.Sequential(nn.Linear(self.state_dim, hypernet_embed),
                                       nn.ReLU(),
                                       nn.Linear(hypernet_embed, self.embed_dim))
        # elif getattr(args, "hypernet_layers", 1) > 2:
        #     raise Exception("Sorry >2 hypernet layers is not implemented!")
        # else:
        #     raise Exception("Error setting number of hypernet layers.")

        # State dependent bias for hidden layer
        self.hyper_b_1 = nn.Linear(self.state_dim, self.embed_dim)

        # V(s) instead of a bias for the last layers
        self.V = nn.Sequential(nn.Linear(self.state_dim, self.embed_dim),
                               nn.ReLU(),
                               nn.Linear(self.embed_dim, 1))

    def forward(self, agent_qs, states):
        bs = agent_qs.size(0)
        states = states.reshape(-1, self.state_dim)
        agent_qs = agent_qs.view(-1, 1, self.n_agents)
        # First layer
        w1 = th.abs(self.hyper_w_1(states))
        b1 = self.hyper_b_1(states)
        w1 = w1.view(-1, self.n_agents, self.embed_dim)
        b1 = b1.view(-1, 1, self.embed_dim)
        print(w1.shape, b1.shape)
        hidden = F.elu(th.bmm(agent_qs, w1) + b1)
        # Second layer
        w_final = th.abs(self.hyper_w_final(states))
        w_final = w_final.view(-1, self.embed_dim, 1)
        # State-dependent bias
        v = self.V(states).view(-1, 1, 1)
        # Compute final output
        y = th.bmm(hidden, w_final) + v
        # Reshape and return
        q_tot = y.view(bs, -1, 1)
        return q_tot

NameError: name 'nn' is not defined

In [None]:
n_agents = 2
agent_qs = torch.Tensor(np.random.normal(size=(1, n_agents)))
states = torch.Tensor(np.random.normal(size=(1, 6, 6)))

In [None]:
mixer = QMixer(2, (6, 6), 32, 64)

In [None]:
mixer.forward(agent_qs, states)

In [None]:
import torch as th
import torch.nn as nn
import torch.nn.functional as F


In [None]:
grid.get_global_state()

In [None]:
global_state = torch.Tensor(grid.get_global_state().numpy())
state_dim = np.prod(global_state.numpy().shape)
state_dim

In [None]:
n_agents = grid.num_agents
n_agents

In [None]:
embed_dim = 64
hypernet_embed = 2*embed_dim

hyper_w1 = nn.Sequential(
    nn.Linear(state_dim, hypernet_embed),
    nn.ReLU(),
    nn.Linear(hypernet_embed, embed_dim * n_agents))

In [None]:
agent_qs = torch.Tensor(np.random.normal(size=(5, 1, n_agents)))
agent_qs.shape

In [None]:
states = global_state.view((-1, state_dim))
states.shape
states = torch.flatten(states).view((1, -1))
states.shape

In [None]:
w1 = torch.abs(hyper_w1(states))
w1 = w1.view(-1, n_agents, embed_dim)
w1.shape

In [7]:
torch.bmm(w1, agent_qs.transpose(1, 0)).shape

NameError: name 'torch' is not defined

In [8]:
class MixingLayer(tf.keras.layers.Layer):
    def __init__(self, grid, **kwargs):
        super().__init__(**kwargs)
        self.grid = grid
        self.state_dim = int(np.prod(np.shape(grid.get_global_state())))
        self.embed_dim = 64
        
        self.hyper_w1 = tf.keras.Sequential([
            tf.keras.layers.Input(self.state_dim),
            tf.keras.layers.Dense(self.embed_dim),
            tf.keras.layers.Activation("relu"),
            tf.keras.layers.Dense(self.embed_dim*self.grid.num_agents)
        ])

        self.hyper_w2 = tf.keras.Sequential([
            tf.keras.layers.Input(self.state_dim),
            tf.keras.layers.Dense(self.embed_dim),
            tf.keras.layers.Activation("relu"),
            tf.keras.layers.Dense(self.embed_dim)
        ])

        self.hyper_b1 = tf.keras.layers.Dense(self.embed_dim, input_shape=[self.state_dim])
        
        self.hyper_b2 = tf.keras.Sequential([
            tf.keras.layers.Input(self.state_dim),
            tf.keras.layers.Dense(self.embed_dim),
            tf.keras.layers.Activation("relu"),
            tf.keras.layers.Dense(1)
        ]) 

    ''' def forward(self, agent_qs, states):
        bs = agent_qs.size(0)
        states = states.reshape(-1, self.state_dim)
        
        agent_qs = agent_qs.view(-1, 1, self.n_agents)
        
        # First layer
        w1 = th.abs(self.hyper_w_1(states))
        b1 = self.hyper_b_1(states)
        
        w1 = w1.view(-1, self.n_agents, self.embed_dim)
        
        b1 = b1.view(-1, 1, self.embed_dim)
        
        hidden = F.elu(th.bmm(agent_qs, w1) + b1)
        
        # Second layer
        w_final = th.abs(self.hyper_w_final(states))
        w_final = w_final.view(-1, self.embed_dim, 1)
        # State-dependent bias
        v = self.V(states).view(-1, 1, 1)
        # Compute final output
        y = th.bmm(hidden, w_final) + v
        # Reshape and return
        q_tot = y.view(bs, -1, 1)
        return q_tot'''
    
    def call(self, inputs):
        # q_values = tf.reshape(inputs[0], (1, self.grid.num_agents, -1)) # 1, 2, 5
        q_values = tf.transpose(tf.reshape(inputs[0], (1, self.grid.num_agents, -1)), (2, 0, 1))
        global_state = tf.reshape(inputs[1] , (-1, self.state_dim)) # 1, 144

        w1 = self.hyper_w1(global_state)
        w1 = tf.math.abs(w1) 
        w1 = tf.reshape(w1, (1, grid.num_agents, 64)) 

        b1 = self.hyper_b1(global_state)
        
        b1 = tf.reshape(b1, (-1, 1, 64))
        print(w1.shape, b1.shape)

        layer1 = tf.matmul(q_values, w1)

        # layer1 = tf.keras.activations.elu(tf.matmul(q_values, w1) + b1)

        print(layer1.shape)
        
        # w2 = self.hyper_w2(global_state)
        # w2 = tf.math.abs(w2)
        # b2 = tf.keras.activations.relu(self.hyper_b2(global_state))
        # b2 = self.hyper_b2_final(b2)

        # joint_q = tf.matmul(output_w1, w2, transpose_b=True) + b2
        # joint_q = tf.reshape(joint_q, (1, -1))
        # return joint_q
        return 0 


In [9]:
rows = 6
cols = 6
num_agents = 2

grid = GridEnv(rows, cols, num_agents)
grid.populate_grid()
qmix = grid.qmix()

(1, 2, 64) (1, 1, 64)
(5, 1, 64)


In [10]:
global_state = grid.get_global_state()
agent_states = []
for agent in grid.agent_list:
    agent_states.append(np.array([[agent.get_local_state()]]))
    agent_states.append(np.array([[agent.previous_action.one_hot]]))

In [11]:
q_values, q_total = qmix((agent_states, global_state))

(1, 2, 64) (1, 1, 64)
(5, 1, 64)


In [12]:
q_values

[<tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[-0.00277368,  0.05317054,  0.17464086,  0.02114644,  0.13932592]],
       dtype=float32)>,
 <tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[-0.05265313,  0.04974331,  0.04907609, -0.02351041, -0.03261544]],
       dtype=float32)>]

In [13]:
q_total

0