This part is reproduction of a previous work of past year student, Timothy Yeo

With 2 Data Center, each with 5 machines and 6 queue length.
total feature nubmer is 2 * (5 * 2+6 * 3+2) = 60

## Single-Agent Baseline

In [223]:
# create a NN network with following architecture
# Input Layer (60 features)
# Leaky ReLU activation
# Hidden Layer 1 (512 features)
# Leaky ReLU activation
# Hidden Layer 2 (256 features)
# Leaky ReLU activation
# Hidden Layer 3 (256 features)
# Leaky ReLU activation
# Hidden Layer 3 (256 features)
# Leaky ReLU activation
# Output Layer (4 actions)

import numpy as np
import torch
import torch.nn as nn

class SingleAgentNN(nn.Module):
    def __init__(self):
        super(SingleAgentNN, self).__init__()
        self.layer1 = nn.Linear(60, 512)
        self.layer2 = nn.Linear(512, 256)
        self.layer3 = nn.Linear(256, 256)
        self.layer4 = nn.Linear(256, 256)
        self.layer5 = nn.Linear(256, 4)

    def __str__(self):
        return f'Neural Network with input layer {self.input_layer}, hidden layer 1 {self.hidden_layer_1}, hidden layer 2 {self.hidden_layer_2}, hidden layer 3 {self.hidden_layer_3}, hidden layer 4 {self.hidden_layer_4}, and output layer {self.output_layer}'

    def __repr__(self):
        return self.__str__()
    
    def forward_pass(self, input_data):
        x = self.layer1(input_data)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer2(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer3(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer4(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer5(x)
        return x

Above are implementation of a previous student's work.
Following will be the work by myself.

# Data Center

In [1]:
STATE_SIZE = 28
QUERY_SIZE = 1
VALUE_SIZE = 4
# second config: no information is passed
VALUE_SIZE = 1


JOB_SIZE = 2

In [2]:
# import necessary libraries
import torch
import torch.nn as nn

In [3]:
# Each data center holds two NN, one for state compression (Attention-model) and another for action selection (DQN), the following class represents the Attention model

# It has access to the full state of the data center and state representation of neighboring data centers

# State Compression Using Attention Model
class StateCompressor(nn.Module):
    def __init__(self, state_size, query_size, value_size, device="cpu"):
        super(StateCompressor, self).__init__()
        # might replace into a more appropriate encoder
        self.W_v_local = nn.Linear(state_size, value_size).to(device)
        # attention qkv
        self.W_q = nn.Linear(value_size, query_size).to(device)
        # value size add 1 to include extra encoding for the local state distingushment
        self.W_k = nn.Linear(value_size+1, query_size).to(device)
        self.W_v = nn.Linear(value_size+1, value_size).to(device)

        self.device = device

    def __str__(self):
        return ""
    
    def __repr__(self):
        return self.__str__()
    
    def forward_pass(self, local_state, remote_info):
        local_info = self.W_v_local(local_state.detach())
        full_info = torch.cat((local_info.detach().unsqueeze(0), remote_info.detach()), 0)
        full_info_with_encoding = torch.cat((full_info, torch.zeros(full_info.size(0), 1).to(self.device)), 1)
        full_info_with_encoding[0, -1] = 1


        # print("full_info", full_info.size())
        q = self.W_q(local_info)
        k = self.W_k(full_info_with_encoding)
        v = self.W_v(full_info_with_encoding)
        # print(q.size(), k.size(), v.size())
        x = torch.matmul(q, k.T)
        # print(x.size())
        x = torch.nn.functional.softmax(x)
        x = torch.matmul(x, v)

        return local_info + x

In [4]:
# brief test
state_compressor = StateCompressor(STATE_SIZE, QUERY_SIZE, VALUE_SIZE)
print(state_compressor.forward_pass(torch.zeros(STATE_SIZE), torch.zeros((2,VALUE_SIZE))))

tensor([0.3812], grad_fn=<AddBackward0>)


  x = torch.nn.functional.softmax(x)


In [5]:
# action selection using DQN

class DQN(nn.Module):
    def __init__(self, state_size, rep_size, device="cpu"):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(state_size+rep_size+JOB_SIZE+1, 512).to(device)
        self.layer2 = nn.Linear(512, 256).to(device)
        self.layer3 = nn.Linear(256, 256).to(device)
        self.layer4 = nn.Linear(256, 256).to(device)
        self.layer5 = nn.Linear(256, 1).to(device)
    
    def forward_pass(self, x):
        x = self.layer1(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer2(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer3(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer4(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer5(x)
        return x

In [6]:
# brief test
dqn = DQN(STATE_SIZE, VALUE_SIZE)
print(dqn.forward_pass(torch.zeros(STATE_SIZE + VALUE_SIZE + 2 + 1)))

tensor([-0.0462], grad_fn=<ViewBackward0>)


With 2 Data Center, each with 5 machines and 6 queue length.
total feature nubmer is 2 * (5 * 2+6 * 3+2) = 60

which means that per machine, we have 5*2+6*3 = 28 state size

In [7]:
class DataCenter():
    def __init__(self, device):
        # self.data_center_id = data_center_id
        # self.machine_num = machine_num
        # self.queue_num = queue_num
        self.state = torch.zeros(STATE_SIZE).to(device)
        self.compressor = StateCompressor(STATE_SIZE, QUERY_SIZE, VALUE_SIZE, device=device)
        self.dqn = DQN(STATE_SIZE, VALUE_SIZE)
        self.representations = torch.zeros(VALUE_SIZE).to(device)

        self.device = device

        self.dqn_optimizer = torch.optim.Adam(self.dqn.parameters(), lr=0.001)
        self.compressor_optimizer = torch.optim.Adam(self.compressor.parameters(), lr=0.001)
    
    def update(self, delta):
        with torch.no_grad():
            # reward = gains from successful job allocation - losses from queueing delay
            reward = torch.tensor(0.0).to(self.device)
            # Separate machine states and queue states
            machines = self.state[:10].view(5, 2).clone()
            queues = self.state[10:].view(6, 3).clone()

            # Update machine states
            machines[:, 1] = torch.maximum(torch.zeros_like(machines[:, 1]), machines[:, 1] - delta)
            machines[machines[:, 1] == 0, 0] = 0

            # Find available machines and assign jobs from the queue
            for i in range(queues.size(0)):
                if queues[i, 0] > 0:
                    # Find first available machine
                    available_machine_index = torch.nonzero(machines[:, 0] == 0, as_tuple=False)
                    if available_machine_index.size(0) > 0:
                        first_available = available_machine_index[0].item()
                        machines[first_available, 0] = 1
                        machines[first_available, 1] = queues[i, 1]
                        reward += queues[i, 2]
                        queues[i, :] = 0
                else:
                    break
            # move remaining jobs to the front
            queues = torch.cat((queues[queues[:, 0] > 0], queues[queues[:, 0] == 0]), 0)
            

            
            # queues[:, 2] = torch.maximum(torch.zeros_like(queues[:, 2]), queues[:, 2] - 0.1)
            queues[:, 2] *= 0.9

            # Merge the updated machine and queue states back into self.state
            self.state = torch.cat((machines.view(-1), queues.view(-1)))

            return reward
    
    def update_rep(self, remote_info):
        new_reps = self.compressor.forward_pass(self.state, remote_info)
        # print(new_reps.size(), self.representations.size())
        assert new_reps.size() == self.representations.size()
        self.representations = new_reps
    
    def get_q_values(self, reps, job):
        batch_input = torch.zeros((2, STATE_SIZE + VALUE_SIZE + JOB_SIZE + 1))
        batch_input[0] = torch.cat((self.state, self.representations, job, torch.ones(1).to(self.device)), 0)
        for i in range(reps.size(0)):
            batch_input[i+1] = torch.cat((self.state, reps[i], job, torch.zeros(1).to(self.device)), 0)
            # expand the concat into several instructions
        
        # batch_input[0, :STATE_SIZE] = self.state
        # # batch_input[0, STATE_SIZE:STATE_SIZE+VALUE_SIZE] = self.representations
        # batch_input[0, STATE_SIZE+VALUE_SIZE:STATE_SIZE+VALUE_SIZE+JOB_SIZE] = job

        batch_input.to(self.device)
        q_values = self.dqn.forward_pass(batch_input)
        return q_values

    # add job to the queue of the data center
    def add_job(self, job):
        reward = 0
        state = self.state.clone()
        for i in range(6):
            if state[10+i*3] == 0:
                state[10+i*3] = 1
                state[10+i*3+1] = job[0]
                state[10+i*3+2] = job[1]
                break
        else:
            reward -= 0.2
        self.state = state
        return reward


    
    # how to do this?
    def backprop(self):
        self.dqn_optimizer.step()
        self.compressor_optimizer.step()

        self.dqn_optimizer.zero_grad()
        self.compressor_optimizer.zero_grad()

In [8]:
# test
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

dc = DataCenter(device)
dc.state[0] = 1
dc.state[1] = 4
dc.state[2] = 1
dc.state[3] = 1
dc.state[10] = 1
dc.state[11] = 5
dc.state[12] = 12
dc.state[13] = 1
dc.state[14] = 4
dc.state[15] = 1

dc.update(0.1)
print(dc.state)

remote_info = torch.zeros((2, VALUE_SIZE)).to(device)
dc.update_rep(remote_info)
print(dc.representations)

tensor([1.0000, 3.9000, 1.0000, 0.9000, 1.0000, 5.0000, 1.0000, 4.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], device='cuda:0')
tensor([-0.1790], device='cuda:0', grad_fn=<AddBackward0>)


  x = torch.nn.functional.softmax(x)


In [9]:
class JobGenerator():
    def __init__(self, data_center_num) -> None:
        self.underlying_state = torch.randint(0, 2, (data_center_num,))
        self.data_center_num = data_center_num

    def generate_job(self):
        jobs = []
        for i in range(self.data_center_num):
            if torch.rand(1).item() < 0.02:
                self.underlying_state[i] = 1 - i
                
            seed = torch.rand(1).item()
            if self.underlying_state[i] == 1:
                # choose high workload
                if seed < 0.4:
                    jobs.append((10, 1.0))
                elif seed < 0.7:
                    jobs.append((6, 0.6))
                else:
                    jobs.append((4, 0.4))
            else:
                # choose low workload
                if seed < 0.3:
                    jobs.append((4, 0.4))
                elif seed < 0.7:
                    jobs.append((3, 0.3))
                else:
                    jobs.append((2, 0.2))
        return [torch.tensor(j) for j in jobs]

In [10]:
# test
job_generator = JobGenerator(2)
print(job_generator.generate_job())

[tensor([2.0000, 0.2000]), tensor([3.0000, 0.3000])]


# Validity check

To investigate that there is no trivial solution for this task.

In [226]:
device = torch.device("cpu")
total_reward = 0
total_neg_reward = 0

dataCenter = DataCenter(device)
jobGenerator = JobGenerator(1)
for i in range(2000):
    job = jobGenerator.generate_job()[0]
    reward = dataCenter.add_job(job)
    total_neg_reward += reward
    reward += dataCenter.update(1)
    total_reward += reward

print(total_neg_reward)

dataCenter = DataCenter(device)
jobGenerator = JobGenerator(1)
for i in range(2000):
    job = jobGenerator.generate_job()[0]
    reward = dataCenter.add_job(job)
    reward += dataCenter.update(1)
    total_reward += reward

print(total_reward)

-113.00000000000107
tensor(740.3980)


# Utility Definitions

In [190]:
def epsilon_greedy(q_values, epsilon):
    action = None
    if torch.rand(1).item() < epsilon:
        action = torch.randint(0, q_values.size(0), (1, ))
    else:
        action = torch.argmax(q_values)
    q_value = q_values[action]
    return action, q_value

In [272]:
import random

class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, rep, job, action, reward, next_state, next_rep, next_job):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, rep, job, action, reward, next_state, next_rep, next_job)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        # zip into torch tensors
        # return zip(*random.sample(self.buffer, batch_size))
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        state, rep, job, action, reward, next_state, next_rep, next_job = zip(*batch)
        return torch.stack(list(state)), torch.stack(list(rep)), torch.stack(list(job)), torch.stack(list(action)), torch.stack(list(reward)), torch.stack(list(next_state)), torch.stack(list(next_rep)), torch.stack(list(next_job))

    def __len__(self):
        return len(self.buffer)

In [273]:
# sample test

replay_buffer = ReplayBuffer(100)
replay_buffer.push(torch.zeros(STATE_SIZE), torch.zeros(VALUE_SIZE), torch.zeros(JOB_SIZE), torch.zeros(1), torch.zeros(1), torch.zeros(STATE_SIZE), torch.zeros(VALUE_SIZE), torch.zeros(JOB_SIZE))
replay_buffer.push(torch.ones(STATE_SIZE), torch.ones(VALUE_SIZE), torch.ones(JOB_SIZE), torch.zeros(1), torch.zeros(1), torch.ones(STATE_SIZE), torch.ones(VALUE_SIZE), torch.ones(JOB_SIZE))

print(replay_buffer.sample(3))

(tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]), tensor([[1.],
        [0.]]), tensor([[1., 1.],
        [0., 0.]]), tensor([[0.],
        [0.]]), tensor([[0.],
        [0.]]), tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]), tensor([[1.],
        [0.]]), tensor([[1., 1.],
        [0., 0.]]))


# Main Run

In [264]:
BATCH_SIZE = 256

## Pure DQN

In [368]:
# Main Simulation

# configs
torch.autograd.set_detect_anomaly(True)
EPSILON = 1


# Check if GPU is available and if so, use it
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
print(f"Using device: {device}")

# Create Data Centers
data_center_num = 2
dataCenter1 = DataCenter(device)
dataCenter2 = DataCenter(device)


model = SingleAgentNN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

if False:
    print("trained model loaded from file")
    model = torch.load("SingleAgentBaseline.pth")
    EPSILON = 0.95**60
else:
    print("model initialized randomly")

replay_buffer = ReplayBuffer(200000)
dummy_value = torch.zeros(1).to(device)


M = 120 #500
N = 2000 #3000


for episode in range(M):
    EPSILON *= 0.95
    # keep track of rewards
    total_rewards = 0
    actions_record = torch.zeros((2,))

    # create a loss function
    dataCenter1.state = torch.zeros(STATE_SIZE).to(device)
    dataCenter2.state = torch.zeros(STATE_SIZE).to(device)
    total_rewards = 0

    jobGenerator = JobGenerator(data_center_num)

    jobs = jobGenerator.generate_job()

    # used for replay buffer
    curr_state = torch.cat((dataCenter1.state, dataCenter2.state, jobs[0].to(device), jobs[1].to(device)), 0)

    # get initial actions
    q_values = model.forward_pass(curr_state)
    action, q_value = epsilon_greedy(q_values, EPSILON)

    for timestep in range(N):
        # update according to action

        reward1 = 0
        reward2 = 0

        reward = torch.tensor(0.0).to(device)

        reward_from_1 = 0
        reward_from_2 = 0

        if action % 2 == 0:
            reward_from_1 += dataCenter1.add_job(jobs[0])
        else:
            jobs[0][1] *= 0.8
            reward_from_2 += dataCenter2.add_job(jobs[0])

        if action // 2 == 0:
            reward_from_2 +=  dataCenter2.add_job(jobs[1])
        else:
            jobs[1][1] *= 0.8
            reward_from_1 += dataCenter1.add_job(jobs[1])
        actions_record[action % 2 ] += 1
        actions_record[action // 2] += 1


        # print("state before update:", dataCenter1.state, q_value_1)

        reward_from_1 += dataCenter1.update(1)
        reward_from_2 += dataCenter2.update(1)

        reward1 = reward_from_1 * 0.5 + reward_from_2 * 0.5
        reward2 = reward_from_2 * 0.5 + reward_from_1 * 0.5
        
        reward = reward1 + reward2

        jobs = jobGenerator.generate_job()

        next_state = torch.cat((dataCenter1.state, dataCenter2.state, jobs[0].to(device), jobs[1].to(device)), 0)

        total_rewards += reward

        ############################## update replay buffer ##############################

        replay_buffer.push(curr_state, dummy_value, jobs[0].to(device), action.view(-1).to(device), reward1, next_state, dummy_value, jobs[1].to(device))
        # print(curr_state, dummy_value, jobs[0].to(device), action, reward1, next_state, dummy_value, jobs[1].to(device))
        sample_state, _, _, sample_action, sample_reward, sample_next_state, _, _ = replay_buffer.sample(BATCH_SIZE)
        replay_actual_q_values = model.forward_pass(sample_state.detach())[torch.arange(sample_state.size(0)), sample_action.view(-1)]
        replay_next_q_values = model.forward_pass(sample_next_state.detach())
        # print(torch.max(replay_next_q_values, 1).size())
        replay_expected_values = sample_reward + 0.95 * torch.max(replay_next_q_values, 1)[0]
        loss = torch.nn.MSELoss()(replay_expected_values.detach(), replay_actual_q_values)
        loss.backward(retain_graph=True)


        optimizer.step()
        optimizer.zero_grad()

        curr_state = torch.cat((dataCenter1.state, dataCenter2.state, jobs[0].to(device), jobs[1].to(device)), 0)
        # get next actions
        q_values = model.forward_pass(curr_state)
        action, q_value = epsilon_greedy(q_values, EPSILON)


        # print(cur_state)
    print("we got ", total_rewards, "total reward", " and actions", actions_record, EPSILON*200)
    print(total_rewards, dataCenter1.state, dataCenter2.state, action, reward, q_values)

    #print(total_rewards, l_time, r_time)

Using device: cuda:0
model initialized randomly
we got  tensor(1326.1486, device='cuda:0') total reward  and actions tensor([2056., 1944.]) 190.0
tensor(1326.1486, device='cuda:0') tensor([1.0000, 1.0000, 1.0000, 3.0000, 1.0000, 2.0000, 1.0000, 1.0000, 1.0000,
        5.0000, 1.0000, 3.0000, 0.1944, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], device='cuda:0') tensor([ 1.0000,  3.0000,  1.0000,  1.0000,  1.0000,  4.0000,  1.0000,  4.0000,
         1.0000, 10.0000,  1.0000,  3.0000,  0.1771,  1.0000,  3.0000,  0.1968,
         1.0000, 10.0000,  0.5832,  1.0000,  4.0000,  0.2880,  1.0000,  3.0000,
         0.2700,  0.0000,  0.0000,  0.0000], device='cuda:0') tensor([2]) tensor(0.5249, device='cuda:0') tensor([10.5733,  9.9677, 10.5469, 10.4242], device='cuda:0',
       grad_fn=<ViewBackward0>)
we got  tensor(1256.8341, device='cuda:0') total reward  and actions tensor([2018., 1982.]) 180.5
tensor(

In [293]:
if None:
    print("saving the model!")
    torch.save(model, "SingleAgentBaseline.pth")
else:
    print("not saving the model to avoid overwriting")

not saving the model to avoid overwriting


## Independent Learners
Now we provide a baseline of independent learners

We further examined two cases:

1. With reward sharing: reward for each job is shared between two agents equally. Converge to ~1450
2. Without reward sharing: reward for finishing each job is shared only with local agent. Converge to ~

### with reward sharing

In [315]:
# define IL DQN model
import torch
import torch.nn as nn

class IndependentLearnerNN(nn.Module):
    def __init__(self):
        super(IndependentLearnerNN, self).__init__()
        self.layer1 = nn.Linear(30, 512)
        self.layer2 = nn.Linear(512, 256)
        self.layer3 = nn.Linear(256, 256)
        self.layer4 = nn.Linear(256, 256)
        self.layer5 = nn.Linear(256, 2)

    def __str__(self):
        return f'Neural Network with input layer {self.input_layer}, hidden layer 1 {self.hidden_layer_1}, hidden layer 2 {self.hidden_layer_2}, hidden layer 3 {self.hidden_layer_3}, hidden layer 4 {self.hidden_layer_4}, and output layer {self.output_layer}'

    def __repr__(self):
        return self.__str__()
    
    def forward_pass(self, input_data):
        x = self.layer1(input_data)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer2(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer3(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer4(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer5(x)
        return x

# Main Simulation

# configs
torch.autograd.set_detect_anomaly(True)
EPSILON = 1

# Check if GPU is available and if so, use it
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
print(f"Using device: {device}")

if True:
    # Create Data Centers
    data_center_num = 2
    dataCenter1 = DataCenter(device)
    dataCenter2 = DataCenter(device)

    model_1 = IndependentLearnerNN().to(device)
    model_2 = IndependentLearnerNN().to(device)

    optimizer_1 = torch.optim.Adam(model_1.parameters(), lr=0.001)
    optimizer_2 = torch.optim.Adam(model_2.parameters(), lr=0.001)

    # Initialize Replay Buffer
    replay_buffer_1 = ReplayBuffer(200000)
    replay_buffer_2 = ReplayBuffer(200000)
    dummy_value = torch.zeros(1).to(device)
else:
    EPSILON = 0.95**60


# Hyperparameters
M = 100 # Number of episodes
N = 2000 # Number of timesteps per episode

for episode in range(M):
    EPSILON *= 0.95
    # keep track of rewards
    total_rewards = 0
    actions_record = torch.zeros((2,))

    # Reset states
    dataCenter1.state = torch.zeros(STATE_SIZE).to(device)
    dataCenter2.state = torch.zeros(STATE_SIZE).to(device)
    total_rewards = 0

    jobGenerator = JobGenerator(data_center_num)

    jobs = jobGenerator.generate_job()

    curr_state_1 = torch.cat((torch.zeros(STATE_SIZE).to(device), jobs[0].to(device)), 0)
    curr_state_2 = torch.cat((torch.zeros(STATE_SIZE).to(device), jobs[1].to(device)), 0)

    # Get initial actions
    q_values_1 = model_1.forward_pass(curr_state_1)
    q_values_2 = model_2.forward_pass(curr_state_2)

    action1, q_value_1 = epsilon_greedy(q_values_1, EPSILON)
    action2, q_value_2 = epsilon_greedy(q_values_2, EPSILON)

    print("initial actions", action1, action2)

    for timestep in range(N):
        reward1 = 0
        reward2 = 0
        reward = torch.tensor(0.0).to(device)
        reward_from_1 = 0
        reward_from_2 = 0

        if action1 == 0:
            reward_from_1 += dataCenter1.add_job(jobs[0])
        else:
            jobs[0][1] *= 0.8
            reward_from_2 += dataCenter2.add_job(jobs[0])

        if action2 == 0:
            reward_from_2 += dataCenter2.add_job(jobs[1])
        else:
            jobs[1][1] *= 0.8
            reward_from_1 += dataCenter1.add_job(jobs[1])
        
        # print(action1, actions_record)
        actions_record[action1] += 1
        actions_record[action2] += 1

        reward_from_1 += dataCenter1.update(1)
        reward_from_2 += dataCenter2.update(1)

        reward1 = reward_from_1 * 0.5 + reward_from_2 * 0.5
        reward2 = reward_from_2 * 0.5 + reward_from_1 * 0.5
        
        reward = reward1 + reward2

        jobs = jobGenerator.generate_job()

        next_state_1 = torch.cat((dataCenter1.state, jobs[0].to(device)), 0)
        next_state_2 = torch.cat((dataCenter2.state, jobs[1].to(device)), 0)

        total_rewards += reward

        ############################## update replay buffer ##############################

        replay_buffer_1.push(curr_state_1, dummy_value, jobs[0].to(device), action1.view(-1).to(device), reward1, next_state_1, dummy_value, jobs[1].to(device))
        replay_buffer_2.push(curr_state_2, dummy_value, jobs[1].to(device), action2.view(-1).to(device), reward2, next_state_2, dummy_value, jobs[0].to(device))

        sample_state_1, _, _, sample_action_1, sample_reward_1, sample_next_state_1, _, _ = replay_buffer_1.sample(BATCH_SIZE)
        sample_state_2, _, _, sample_action_2, sample_reward_2, sample_next_state_2, _, _ = replay_buffer_2.sample(BATCH_SIZE)

        replay_actual_q_values_1 = model_1.forward_pass(sample_state_1.detach())[torch.arange(sample_state_1.size(0)), sample_action_1.view(-1)]
        replay_actual_q_values_2 = model_2.forward_pass(sample_state_2.detach())[torch.arange(sample_state_2.size(0)), sample_action_2.view(-1)]

        replay_next_q_values_1 = model_1.forward_pass(sample_next_state_1.detach())
        replay_next_q_values_2 = model_2.forward_pass(sample_next_state_2.detach())

        replay_expected_values_1 = sample_reward_1 + 0.95 * torch.max(replay_next_q_values_1, 1)[0]
        replay_expected_values_2 = sample_reward_2 + 0.95 * torch.max(replay_next_q_values_2, 1)[0]

        loss_1 = torch.nn.MSELoss()(replay_expected_values_1.detach(), replay_actual_q_values_1)
        loss_2 = torch.nn.MSELoss()(replay_expected_values_2.detach(), replay_actual_q_values_2)

        loss_1.backward(retain_graph=True)
        loss_2.backward(retain_graph=True)

        optimizer_1.step()
        optimizer_2.step()

        optimizer_1.zero_grad()
        optimizer_2.zero_grad()

        ############################## get next actions ##############################
        # Update states and actions

        curr_state_1 = torch.cat((dataCenter1.state, jobs[0].to(device)), 0)
        curr_state_2 = torch.cat((dataCenter2.state, jobs[1].to(device)), 0)

        q_values_1 = model_1.forward_pass(curr_state_1)
        q_values_2 = model_2.forward_pass(curr_state_2)
        action1, q_value_1 = epsilon_greedy(q_values_1, EPSILON)
        action2, q_value_2 = epsilon_greedy(q_values_2, EPSILON)

    print("we got ", total_rewards, "total reward")
    print("actions", actions_record, EPSILON * 2000)
    print(total_rewards, dataCenter1.state, dataCenter2.state, action1, action2, reward, q_values_1, q_values_2)


Using device: cuda:0
initial actions tensor(0, device='cuda:0') tensor(0, device='cuda:0')
we got  tensor(1489.5868, device='cuda:0') total reward
actions tensor([2172., 1828.]) 87.53261807520869
tensor(1489.5868, device='cuda:0') tensor([1., 7., 1., 2., 1., 8., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], device='cuda:0') tensor([ 1.,  6.,  1., 10.,  1.,  3.,  1.,  6.,  1.,  3.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       device='cuda:0') tensor(0, device='cuda:0') tensor(0, device='cuda:0') tensor(1.6760, device='cuda:0') tensor([7.9454, 7.7798], device='cuda:0', grad_fn=<ViewBackward0>) tensor([8.7113, 7.9592], device='cuda:0', grad_fn=<ViewBackward0>)
initial actions tensor(1, device='cuda:0') tensor(1, device='cuda:0')
we got  tensor(1424.6742, device='cuda:0') total reward
actions tensor([2268., 1732.]) 83.15598717144826
tensor(1424.6742, device='cuda:0') tensor([1.00

KeyboardInterrupt: 

### no reward sharing

In [316]:
# define IL DQN model
import torch
import torch.nn as nn

class IndependentLearnerNN(nn.Module):
    def __init__(self):
        super(IndependentLearnerNN, self).__init__()
        self.layer1 = nn.Linear(30, 512)
        self.layer2 = nn.Linear(512, 256)
        self.layer3 = nn.Linear(256, 256)
        self.layer4 = nn.Linear(256, 256)
        self.layer5 = nn.Linear(256, 2)

    def __str__(self):
        return f'Neural Network with input layer {self.input_layer}, hidden layer 1 {self.hidden_layer_1}, hidden layer 2 {self.hidden_layer_2}, hidden layer 3 {self.hidden_layer_3}, hidden layer 4 {self.hidden_layer_4}, and output layer {self.output_layer}'

    def __repr__(self):
        return self.__str__()
    
    def forward_pass(self, input_data):
        x = self.layer1(input_data)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer2(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer3(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer4(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer5(x)
        return x

# Main Simulation

# configs
torch.autograd.set_detect_anomaly(True)
EPSILON = 1

# Check if GPU is available and if so, use it
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
print(f"Using device: {device}")

if True:
    # Create Data Centers
    data_center_num = 2
    dataCenter1 = DataCenter(device)
    dataCenter2 = DataCenter(device)

    model_1 = IndependentLearnerNN().to(device)
    model_2 = IndependentLearnerNN().to(device)

    optimizer_1 = torch.optim.Adam(model_1.parameters(), lr=0.001)
    optimizer_2 = torch.optim.Adam(model_2.parameters(), lr=0.001)

    # Initialize Replay Buffer
    replay_buffer_1 = ReplayBuffer(200000)
    replay_buffer_2 = ReplayBuffer(200000)
    dummy_value = torch.zeros(1).to(device)
else:
    EPSILON = 0.95**60


# Hyperparameters
M = 100 # Number of episodes
N = 2000 # Number of timesteps per episode

for episode in range(M):
    EPSILON *= 0.95
    # keep track of rewards
    total_rewards = 0
    actions_record = torch.zeros((2,))

    # Reset states
    dataCenter1.state = torch.zeros(STATE_SIZE).to(device)
    dataCenter2.state = torch.zeros(STATE_SIZE).to(device)
    total_rewards = 0

    jobGenerator = JobGenerator(data_center_num)

    jobs = jobGenerator.generate_job()

    curr_state_1 = torch.cat((torch.zeros(STATE_SIZE).to(device), jobs[0].to(device)), 0)
    curr_state_2 = torch.cat((torch.zeros(STATE_SIZE).to(device), jobs[1].to(device)), 0)

    # Get initial actions
    q_values_1 = model_1.forward_pass(curr_state_1)
    q_values_2 = model_2.forward_pass(curr_state_2)

    action1, q_value_1 = epsilon_greedy(q_values_1, EPSILON)
    action2, q_value_2 = epsilon_greedy(q_values_2, EPSILON)

    print("initial actions", action1, action2)

    for timestep in range(N):
        reward1 = 0
        reward2 = 0
        reward = torch.tensor(0.0).to(device)
        reward_from_1 = 0
        reward_from_2 = 0

        if action1 == 0:
            reward_from_1 += dataCenter1.add_job(jobs[0])
        else:
            jobs[0][1] *= 0.8
            reward_from_2 += dataCenter2.add_job(jobs[0])

        if action2 == 0:
            reward_from_2 += dataCenter2.add_job(jobs[1])
        else:
            jobs[1][1] *= 0.8
            reward_from_1 += dataCenter1.add_job(jobs[1])
        
        # print(action1, actions_record)
        actions_record[action1] += 1
        actions_record[action2] += 1

        reward_from_1 += dataCenter1.update(1)
        reward_from_2 += dataCenter2.update(1)

        reward1 = reward_from_1
        reward2 = reward_from_2
        
        reward = reward1 + reward2

        jobs = jobGenerator.generate_job()

        next_state_1 = torch.cat((dataCenter1.state, jobs[0].to(device)), 0)
        next_state_2 = torch.cat((dataCenter2.state, jobs[1].to(device)), 0)

        total_rewards += reward

        ############################## update replay buffer ##############################

        replay_buffer_1.push(curr_state_1, dummy_value, jobs[0].to(device), action1.view(-1).to(device), reward1, next_state_1, dummy_value, jobs[1].to(device))
        replay_buffer_2.push(curr_state_2, dummy_value, jobs[1].to(device), action2.view(-1).to(device), reward2, next_state_2, dummy_value, jobs[0].to(device))

        sample_state_1, _, _, sample_action_1, sample_reward_1, sample_next_state_1, _, _ = replay_buffer_1.sample(BATCH_SIZE)
        sample_state_2, _, _, sample_action_2, sample_reward_2, sample_next_state_2, _, _ = replay_buffer_2.sample(BATCH_SIZE)

        replay_actual_q_values_1 = model_1.forward_pass(sample_state_1.detach())[torch.arange(sample_state_1.size(0)), sample_action_1.view(-1)]
        replay_actual_q_values_2 = model_2.forward_pass(sample_state_2.detach())[torch.arange(sample_state_2.size(0)), sample_action_2.view(-1)]

        replay_next_q_values_1 = model_1.forward_pass(sample_next_state_1.detach())
        replay_next_q_values_2 = model_2.forward_pass(sample_next_state_2.detach())

        replay_expected_values_1 = sample_reward_1 + 0.95 * torch.max(replay_next_q_values_1, 1)[0]
        replay_expected_values_2 = sample_reward_2 + 0.95 * torch.max(replay_next_q_values_2, 1)[0]

        loss_1 = torch.nn.MSELoss()(replay_expected_values_1.detach(), replay_actual_q_values_1)
        loss_2 = torch.nn.MSELoss()(replay_expected_values_2.detach(), replay_actual_q_values_2)

        loss_1.backward(retain_graph=True)
        loss_2.backward(retain_graph=True)

        optimizer_1.step()
        optimizer_2.step()

        optimizer_1.zero_grad()
        optimizer_2.zero_grad()

        ############################## get next actions ##############################
        # Update states and actions

        curr_state_1 = torch.cat((dataCenter1.state, jobs[0].to(device)), 0)
        curr_state_2 = torch.cat((dataCenter2.state, jobs[1].to(device)), 0)

        q_values_1 = model_1.forward_pass(curr_state_1)
        q_values_2 = model_2.forward_pass(curr_state_2)
        action1, q_value_1 = epsilon_greedy(q_values_1, EPSILON)
        action2, q_value_2 = epsilon_greedy(q_values_2, EPSILON)

    print("we got ", total_rewards, "total reward")
    print("actions", actions_record, EPSILON * 2000)
    print(total_rewards, dataCenter1.state, dataCenter2.state, action1, action2, reward, q_values_1, q_values_2)


Using device: cuda:0
initial actions tensor([0]) tensor([0])
we got  tensor(1321.8208, device='cuda:0') total reward
actions tensor([2029., 1971.]) 1900.0
tensor(1321.8208, device='cuda:0') tensor([1.0000, 3.0000, 1.0000, 3.0000, 1.0000, 8.0000, 1.0000, 7.0000, 1.0000,
        1.0000, 1.0000, 4.0000, 0.3600, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000], device='cuda:0') tensor([ 1.0000,  8.0000,  1.0000,  2.0000,  1.0000,  2.0000,  1.0000,  2.0000,
         1.0000,  1.0000,  1.0000,  4.0000,  0.2362,  1.0000, 10.0000,  0.5249,
         1.0000,  6.0000,  0.3499,  1.0000,  4.0000,  0.2592,  1.0000,  2.0000,
         0.1800,  0.0000,  0.0000,  0.0000], device='cuda:0') tensor([1]) tensor([1]) tensor(0.2160, device='cuda:0') tensor([5.9406e+11, 5.2835e+11], device='cuda:0', grad_fn=<ViewBackward0>) tensor([7.8034, 8.0016], device='cuda:0', grad_fn=<ViewBackward0>)
initial actions tensor([0]) tensor([

KeyboardInterrupt: 

## State-Compression

In [327]:
# define IL DQN model
import torch
import torch.nn as nn

class StateCompressionNN(nn.Module):
    def __init__(self):
        super(StateCompressionNN, self).__init__()
        self.layer1 = nn.Linear(34, 512)
        self.layer2 = nn.Linear(512, 256)
        self.layer3 = nn.Linear(256, 256)
        self.layer4 = nn.Linear(256, 256)
        self.layer5 = nn.Linear(256, 2)

    def __str__(self):
        return f'Neural Network with input layer {self.input_layer}, hidden layer 1 {self.hidden_layer_1}, hidden layer 2 {self.hidden_layer_2}, hidden layer 3 {self.hidden_layer_3}, hidden layer 4 {self.hidden_layer_4}, and output layer {self.output_layer}'

    def __repr__(self):
        return self.__str__()
    
    def forward_pass(self, input_data):
        x = self.layer1(input_data)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer2(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer3(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer4(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer5(x)
        return x

# Main Simulation

# configs
torch.autograd.set_detect_anomaly(True)
EPSILON = 1

# Check if GPU is available and if so, use it
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
print(f"Using device: {device}")

if False:
    # Create Data Centers
    data_center_num = 2
    dataCenter1 = DataCenter(device)
    dataCenter2 = DataCenter(device)

    model_1 = StateCompressionNN().to(device)
    model_2 = StateCompressionNN().to(device)

    optimizer_1 = torch.optim.Adam(model_1.parameters(), lr=0.001)
    optimizer_2 = torch.optim.Adam(model_2.parameters(), lr=0.001)

    # Initialize Replay Buffer
    replay_buffer_1 = ReplayBuffer(200000)
    replay_buffer_2 = ReplayBuffer(200000)
    dummy_value = torch.zeros(1).to(device)
else:
    EPSILON = 0.95**100


def compressed_state(tensor):
    machines = tensor[:10].view(5, 2).clone()
    queues = tensor[10:].view(6, 3).clone()
    return torch.cat((machines.sum(0), queues.sum(0)[:2]), 0)

    

# Hyperparameters
M = 100 # Number of episodes
N = 2000 # Number of timesteps per episode

for episode in range(M):
    EPSILON *= 0.95
    # keep track of rewards
    total_rewards = 0
    actions_record = torch.zeros((2,))

    # Reset states
    dataCenter1.state = torch.zeros(STATE_SIZE).to(device)
    dataCenter2.state = torch.zeros(STATE_SIZE).to(device)
    total_rewards = 0

    jobGenerator = JobGenerator(data_center_num)

    jobs = jobGenerator.generate_job()

    curr_state_1 = torch.cat((dataCenter1.state.to(device), jobs[0].to(device), compressed_state(dataCenter2.state)), 0)
    curr_state_2 = torch.cat((dataCenter2.state.to(device), jobs[1].to(device), compressed_state(dataCenter1.state)), 0)

    # Get initial actions
    q_values_1 = model_1.forward_pass(curr_state_1)
    q_values_2 = model_2.forward_pass(curr_state_2)

    action1, q_value_1 = epsilon_greedy(q_values_1, EPSILON)
    action2, q_value_2 = epsilon_greedy(q_values_2, EPSILON)

    print("initial actions", action1, action2)

    for timestep in range(N):
        reward1 = 0
        reward2 = 0
        reward = torch.tensor(0.0).to(device)
        reward_from_1 = 0
        reward_from_2 = 0

        if action1 == 0:
            reward_from_1 += dataCenter1.add_job(jobs[0])
        else:
            jobs[0][1] *= 0.8
            reward_from_2 += dataCenter2.add_job(jobs[0])

        if action2 == 0:
            reward_from_2 += dataCenter2.add_job(jobs[1])
        else:
            jobs[1][1] *= 0.8
            reward_from_1 += dataCenter1.add_job(jobs[1])
        
        # print(action1, actions_record)
        actions_record[action1] += 1
        actions_record[action2] += 1

        reward_from_1 += dataCenter1.update(1)
        reward_from_2 += dataCenter2.update(1)

        reward1 = reward_from_1 * 0.5 + reward_from_2 * 0.5
        reward2 = reward_from_2 * 0.5 + reward_from_1 * 0.5
        
        reward = reward1 + reward2

        jobs = jobGenerator.generate_job()

        next_state_1 = torch.cat((dataCenter1.state, jobs[0].to(device), compressed_state(dataCenter2.state)), 0)
        next_state_2 = torch.cat((dataCenter2.state, jobs[1].to(device), compressed_state(dataCenter1.state)), 0)

        total_rewards += reward

        ############################## update replay buffer ##############################

        replay_buffer_1.push(curr_state_1, dummy_value, jobs[0].to(device), action1.view(-1).to(device), reward1, next_state_1, dummy_value, jobs[1].to(device))
        replay_buffer_2.push(curr_state_2, dummy_value, jobs[1].to(device), action2.view(-1).to(device), reward2, next_state_2, dummy_value, jobs[0].to(device))

        sample_state_1, _, _, sample_action_1, sample_reward_1, sample_next_state_1, _, _ = replay_buffer_1.sample(BATCH_SIZE)
        sample_state_2, _, _, sample_action_2, sample_reward_2, sample_next_state_2, _, _ = replay_buffer_2.sample(BATCH_SIZE)

        replay_actual_q_values_1 = model_1.forward_pass(sample_state_1.detach())[torch.arange(sample_state_1.size(0)), sample_action_1.view(-1)]
        replay_actual_q_values_2 = model_2.forward_pass(sample_state_2.detach())[torch.arange(sample_state_2.size(0)), sample_action_2.view(-1)]

        replay_next_q_values_1 = model_1.forward_pass(sample_next_state_1.detach())
        replay_next_q_values_2 = model_2.forward_pass(sample_next_state_2.detach())

        replay_expected_values_1 = sample_reward_1 + 0.95 * torch.max(replay_next_q_values_1, 1)[0]
        replay_expected_values_2 = sample_reward_2 + 0.95 * torch.max(replay_next_q_values_2, 1)[0]

        loss_1 = torch.nn.MSELoss()(replay_expected_values_1.detach(), replay_actual_q_values_1)
        loss_2 = torch.nn.MSELoss()(replay_expected_values_2.detach(), replay_actual_q_values_2)

        loss_1.backward(retain_graph=True)
        loss_2.backward(retain_graph=True)

        optimizer_1.step()
        optimizer_2.step()

        optimizer_1.zero_grad()
        optimizer_2.zero_grad()

        ############################## get next actions ##############################
        # Update states and actions

        curr_state_1 = torch.cat((dataCenter1.state.to(device), jobs[0].to(device), compressed_state(dataCenter2.state)), 0)
        curr_state_2 = torch.cat((dataCenter2.state.to(device), jobs[1].to(device), compressed_state(dataCenter1.state)), 0)

        q_values_1 = model_1.forward_pass(curr_state_1)
        q_values_2 = model_2.forward_pass(curr_state_2)
        action1, q_value_1 = epsilon_greedy(q_values_1, EPSILON)
        action2, q_value_2 = epsilon_greedy(q_values_2, EPSILON)

    print("we got ", total_rewards, "total reward")
    print("actions", actions_record, EPSILON * 2000)
    print(total_rewards, dataCenter1.state, dataCenter2.state, action1, action2, reward, q_values_1, q_values_2)


Using device: cuda:0


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x34 and 30x512)

## Action Advantage

In [12]:
# define IL DQN model
import torch
import torch.nn as nn

class AdvantageMARL(nn.Module):
    def __init__(self):
        super(AdvantageMARL, self).__init__()
        self.layer1 = nn.Linear(30, 256)
        self.layer2 = nn.Linear(256, 128)
        self.layer3 = nn.Linear(128, 128)
        self.layer4 = nn.Linear(128, 128)
        self.layer5 = nn.Linear(128, 2)

    def __str__(self):
        return f'Neural Network with input layer {self.input_layer}, hidden layer 1 {self.hidden_layer_1}, hidden layer 2 {self.hidden_layer_2}, hidden layer 3 {self.hidden_layer_3}, hidden layer 4 {self.hidden_layer_4}, and output layer {self.output_layer}'

    def __repr__(self):
        return self.__str__()
    
    def forward_pass(self, input_data):
        x = self.layer1(input_data)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer2(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer3(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer4(x)
        x = torch.nn.functional.leaky_relu(x)
        x = self.layer5(x)
        return x

# Main Simulation

# configs
torch.autograd.set_detect_anomaly(True)
EPSILON = 1

# Check if GPU is available and if so, use it
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
print(f"Using device: {device}")

if True:
    # Create Data Centers
    data_center_num = 2
    dataCenter1 = DataCenter(device)
    dataCenter2 = DataCenter(device)

    model_1 = AdvantageMARL().to(device)
    model_2 = AdvantageMARL().to(device)

    # for double-DQN
    model_1_target = AdvantageMARL().to(device)
    model_2_target = AdvantageMARL().to(device)

    advantage_model_1 = AdvantageMARL().to(device)
    advantage_model_2 = AdvantageMARL().to(device)

    optimizer_1 = torch.optim.Adam(model_1.parameters(), lr=0.001)
    optimizer_2 = torch.optim.Adam(model_2.parameters(), lr=0.001)
    optimizer_advantage_1 = torch.optim.Adam(advantage_model_1.parameters(), lr=0.001)
    optimizer_advantage_2 = torch.optim.Adam(advantage_model_2.parameters(), lr=0.001)

    # Initialize Replay Buffer
    replay_buffer_1 = ReplayBuffer(200000)
    replay_buffer_2 = ReplayBuffer(200000)
    dummy_value = torch.zeros(1).to(device)
else:
    EPSILON = 0.95**100

# Hyperparameters
M = 200 # Number of episodes
N = 2000 # Number of timesteps per episode

for episode in range(M):
    EPSILON *= 0.95
    # keep track of rewards
    total_rewards = 0
    actions_record = torch.zeros((2,))

    # Reset states
    dataCenter1.state = torch.zeros(STATE_SIZE).to(device)
    dataCenter2.state = torch.zeros(STATE_SIZE).to(device)
    total_rewards = 0

    jobGenerator = JobGenerator(data_center_num)

    jobs = jobGenerator.generate_job()

    curr_state_1 = torch.cat((dataCenter1.state.to(device), jobs[0].to(device)), 0)
    curr_state_2 = torch.cat((dataCenter2.state.to(device), jobs[1].to(device)), 0)

    advantage_state_1 = torch.cat((dataCenter1.state.to(device), jobs[1].to(device)), 0)
    advantage_state_2 = torch.cat((dataCenter2.state.to(device), jobs[0].to(device)), 0)

    advantage_value_1 = advantage_model_1.forward_pass(advantage_state_1)
    advantage_value_2 = advantage_model_2.forward_pass(advantage_state_2)

    q_values_1 = model_1.forward_pass(curr_state_1)
    q_values_2 = model_2.forward_pass(curr_state_2)

    action1, q_value_1 = epsilon_greedy(q_values_1 + advantage_value_2, EPSILON)
    action2, q_value_2 = epsilon_greedy(q_values_2 + advantage_value_1, EPSILON)


    print("initial actions", action1, action2)

    for timestep in range(N):
        reward1 = 0
        reward2 = 0
        reward = torch.tensor(0.0).to(device)
        reward_from_1 = 0
        reward_from_2 = 0

        if action1 == 0:
            reward_from_1 += dataCenter1.add_job(jobs[0])
        else:
            jobs[0][1] *= 0.8
            reward_from_1 += dataCenter2.add_job(jobs[0])

        if action2 == 0:
            reward_from_2 += dataCenter2.add_job(jobs[1])
        else:
            jobs[1][1] *= 0.8
            reward_from_2 += dataCenter1.add_job(jobs[1])
        
        # print(action1, actions_record)
        actions_record[action1] += 1
        actions_record[action2] += 1

        reward_from_1 += dataCenter1.update(1)
        reward_from_2 += dataCenter2.update(1)

        # equal reward sharing
        # reward1 = (reward_from_1 + reward_from_2) * 0.5
        # reward2 = (reward_from_1 + reward_from_2) * 0.5
        
        # reward sharing based on local work
        reward1 = reward_from_1
        reward2 = reward_from_2

        # reward sharing 70/30 split
        # reward1 = reward_from_1 * 0.7 + reward_from_2 * 0.3
        # reward2 = reward_from_2 * 0.7 + reward_from_1 * 0.3

        # reward sharing with competition penalty
        # reward1 = reward_from_1 * 1.1 - reward_from_2 * 0.1
        # reward2 = reward_from_2 * 1.1 - reward_from_1 * 0.1

        reward = reward1 + reward2

        jobs = jobGenerator.generate_job()

        next_state_1 = torch.cat((dataCenter1.state, jobs[0].to(device)), 0)
        next_state_2 = torch.cat((dataCenter2.state, jobs[1].to(device)), 0)

        total_rewards += reward

        ############################## update replay buffer ##############################

        replay_buffer_1.push(curr_state_1, advantage_state_1, jobs[0].to(device), action1.view(-1).to(device), reward1, next_state_1, action2.view(-1).to(device), jobs[1].to(device))
        replay_buffer_2.push(curr_state_2, advantage_state_2, jobs[1].to(device), action2.view(-1).to(device), reward2, next_state_2, action1.view(-1).to(device), jobs[0].to(device))

        sample_state_1, sample_advantage_state_1, _, sample_action_1, sample_reward_1, sample_next_state_1, sample_action_1_2, _ = replay_buffer_1.sample(BATCH_SIZE)
        sample_state_2, sample_advantage_state_2, _, sample_action_2, sample_reward_2, sample_next_state_2, sample_action_2_1, _ = replay_buffer_2.sample(BATCH_SIZE)

        ############################## update Q-value model ##############################

        replay_q_values_1 = model_1.forward_pass(sample_state_1.detach())
        replay_q_values_2 = model_2.forward_pass(sample_state_2.detach())

        # replay_actual_q_values_1 = model_1.forward_pass(sample_state_1.detach())[torch.arange(sample_state_1.size(0)), sample_action_1.view(-1)]
        # replay_actual_q_values_2 = model_2.forward_pass(sample_state_2.detach())[torch.arange(sample_state_2.size(0)), sample_action_2.view(-1)]
        replay_actual_q_values_1 = replay_q_values_1[torch.arange(sample_state_1.size(0)), sample_action_1.view(-1)]
        replay_actual_q_values_2 = replay_q_values_2[torch.arange(sample_state_2.size(0)), sample_action_2.view(-1)]

        # replay_next_q_values_1 = model_1.forward_pass(sample_next_state_1.detach())
        # replay_next_q_values_2 = model_2.forward_pass(sample_next_state_2.detach())

        replay_next_q_values_1 = model_1_target.forward_pass(sample_next_state_1.detach())
        replay_next_q_values_2 = model_2_target.forward_pass(sample_next_state_2.detach())

        replay_expected_values_1 = sample_reward_1 + 0.95 * torch.max(replay_next_q_values_1, 1)[0]
        replay_expected_values_2 = sample_reward_2 + 0.95 * torch.max(replay_next_q_values_2, 1)[0]

        loss_1 = torch.nn.MSELoss()(replay_expected_values_1.detach(), replay_actual_q_values_1)
        loss_2 = torch.nn.MSELoss()(replay_expected_values_2.detach(), replay_actual_q_values_2)

        loss_1.backward(retain_graph=True)
        loss_2.backward(retain_graph=True)

        optimizer_1.step()
        optimizer_2.step()

        optimizer_1.zero_grad()
        optimizer_2.zero_grad()

        ############################## update Advantage model ##############################

        replay_actual_advantage_values_1 = advantage_model_1.forward_pass(sample_advantage_state_1.detach())[torch.arange(sample_advantage_state_1.size(0)), sample_action_1_2.view(-1)]
        replay_actual_advantage_values_2 = advantage_model_2.forward_pass(sample_advantage_state_2.detach())[torch.arange(sample_advantage_state_2.size(0)), sample_action_2_1.view(-1)]

        replay_expected_advantage_values_1 = replay_expected_values_1 - replay_actual_q_values_1 
        replay_expected_advantage_values_2 = replay_expected_values_2 - replay_actual_q_values_2
        # replay_expected_advantage_values_1 = torch.max(replay_q_values_1, 1)[0] - replay_actual_q_values_1
        # replay_expected_advantage_values_2 = torch.max(replay_q_values_2, 1)[0] - replay_actual_q_values_2

        loss_advantage_1 = torch.nn.MSELoss()(replay_expected_advantage_values_1.detach(), replay_actual_advantage_values_1)
        loss_advantage_2 = torch.nn.MSELoss()(replay_expected_advantage_values_2.detach(), replay_actual_advantage_values_2)

        loss_advantage_1.backward(retain_graph=True)
        loss_advantage_2.backward(retain_graph=True)

        optimizer_advantage_1.step()
        optimizer_advantage_2.step()

        optimizer_advantage_1.zero_grad()
        optimizer_advantage_2.zero_grad()

        ############################## get next actions ##############################
        # Update states and actions

        curr_state_1 = torch.cat((dataCenter1.state.to(device), jobs[0].to(device)), 0)
        curr_state_2 = torch.cat((dataCenter2.state.to(device), jobs[1].to(device)), 0)

        advantage_state_1 = torch.cat((dataCenter1.state.to(device), jobs[1].to(device)), 0)
        advantage_state_2 = torch.cat((dataCenter2.state.to(device), jobs[0].to(device)), 0)

        advantage_value_1 = advantage_model_1.forward_pass(advantage_state_1)
        advantage_value_2 = advantage_model_2.forward_pass(advantage_state_2)

        q_values_1 = model_1.forward_pass(curr_state_1)
        q_values_2 = model_2.forward_pass(curr_state_2)

        action1, q_value_1 = epsilon_greedy(q_values_1 + advantage_value_2, EPSILON)
        action2, q_value_2 = epsilon_greedy(q_values_2 + advantage_value_1, EPSILON)
        
        # print(total_rewards, dataCenter1.state, dataCenter2.state, action1, action2, reward, q_values_1, q_values_2, advantage_value_1, advantage_value_2)

        # update target network
        if timestep % 100 == 0:
            model_1_target.load_state_dict(model_1.state_dict())
            model_2_target.load_state_dict(model_2.state_dict())

    print("we got ", total_rewards, "total reward")
    print("actions", actions_record, EPSILON * 2000)
    print(total_rewards, dataCenter1.state, dataCenter2.state, action1, action2, reward, q_values_1, q_values_2, advantage_value_1, advantage_value_2)


Using device: cuda:0


NameError: name 'ReplayBuffer' is not defined

In [366]:
# # save the model, optimizer, and replay buffer in a dictionary

# torch.save({
#     'model_1': model_1.state_dict(),
#     'model_2': model_2.state_dict(),
#     'advantage_model_1': advantage_model_1.state_dict(),
#     'advantage_model_2': advantage_model_2.state_dict(),
#     'optimizer_1': optimizer_1.state_dict(),
#     'optimizer_2': optimizer_2.state_dict(),
#     'optimizer_advantage_1': optimizer_advantage_1.state_dict(),
#     'optimizer_advantage_2': optimizer_advantage_2.state_dict(),
# }, 'advantage_competition_sharing_double_q.pth')

In [351]:
fit_model = AdvantageMARL()

optimzer = torch.optim.Adam(fit_model.parameters(), lr=0.001)

for i in range(1000):
    

43037


## Transformer

In [169]:
# Main Simulation

EPSILON = 1

# configs
torch.autograd.set_detect_anomaly(True)


# Check if GPU is available and if so, use it
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
print(f"Using device: {device}")

# Create Data Centers
data_center_num = 2
dataCenter1 = DataCenter(device)
dataCenter2 = DataCenter(device)


M = 100 #500
N = 2000 #3000

# M = 500
# N = 3000

for episode in range(M):
    EPSILON *= 0.95
    # keep track of rewards
    total_rewards = 0
    actions = torch.zeros((2,))

    # for debugging
    current_episode_jobs = []
    current_episode_actions = []

    # create a loss function
    dataCenter1.state = torch.zeros(STATE_SIZE).to(device)
    dataCenter2.state = torch.zeros(STATE_SIZE).to(device)
    total_rewards = 0

    jobGenerator = JobGenerator(data_center_num)

    l_time = 0
    r_time = 0
    import time

    jobs = jobGenerator.generate_job()
    current_episode_jobs.append(jobs)

    # get initial actions
    q_values_1 = dataCenter1.get_q_values(dataCenter2.representations.unsqueeze(0), jobs[0].to(device))
    q_values_2 = dataCenter2.get_q_values(dataCenter1.representations.unsqueeze(0), jobs[1].to(device))

    action1, q_value_1 = epsilon_greedy(q_values_1, EPSILON)
    action2, q_value_2 = epsilon_greedy(q_values_2, EPSILON)
    current_episode_actions.append((action1, action2))

    for timestep in range(N):
        # update according to action
        pre = time.time()

        reward1 = 0
        reward2 = 0

        reward = torch.tensor(0.0).to(device)

        if action1 == 0:
            reward += dataCenter1.add_job(jobs[0])
        else:
            jobs[0][1] *= 0.8
            reward += dataCenter2.add_job(jobs[0])

        if action2 == 0:
            reward += dataCenter2.add_job(jobs[1])
        else:
            jobs[1][1] *= 0.8
            reward += dataCenter1.add_job(jobs[1])

        actions[action1] += 1
        actions[action2] += 1

        reward += dataCenter1.update(1)
        reward += dataCenter2.update(1)

        post = time.time()
        l_time += post - pre
        

        
        # update representations
        pre = time.time()
        jobs = jobGenerator.generate_job()
        current_episode_jobs.append(jobs)

        # get the representations
        dataCenter1.update_rep(dataCenter2.representations.unsqueeze(0))
        dataCenter2.update_rep(dataCenter1.representations.unsqueeze(0))

        # get next actions
        new_q_values_1 = dataCenter1.get_q_values(dataCenter2.representations.unsqueeze(0), jobs[0].to(device))
        new_q_values_2 = dataCenter2.get_q_values(dataCenter1.representations.unsqueeze(0), jobs[1].to(device))

        new_action1, new_q_value_1 = torch.argmax(new_q_values_1), torch.max(new_q_values_1)
        new_action2, new_q_value_2 = torch.argmax(new_q_values_2), torch.max(new_q_values_2)

        post = time.time()
        r_time += post - pre

        # handle rewards
        total_rewards += reward
        reward1 = reward / 2
        reward2 = reward / 2

        # print(f"{timestep}th loop", reward, total_rewards, dataCenter1.state, dataCenter1.representations, action1, action2)

        # backprop
        expected_value_1 = reward1 + 0.9 * torch.max(new_q_values_1)
        actual_value_1 = q_value_1.to(device)
        loss_1 = torch.nn.MSELoss()(expected_value_1.detach(), actual_value_1)
        loss_1.backward(retain_graph=True)

        expected_value_2 = reward2 + 0.9 * torch.max(new_q_values_2)
        actual_value_2 = q_value_2.to(device)
        loss_2 = torch.nn.MSELoss()(expected_value_2.detach(), actual_value_2)
        loss_2.backward(retain_graph=True)

        if episode % 10 == 0:
            dataCenter1.backprop()
            dataCenter2.backprop()

        # # need to re-calculate the q_values to avoid issues in the backpropagation
        
        # get the representations
        dataCenter1.update_rep(dataCenter2.representations.unsqueeze(0))
        dataCenter2.update_rep(dataCenter1.representations.unsqueeze(0))

        # get initial actions
        q_values_1 = dataCenter1.get_q_values(dataCenter2.representations.unsqueeze(0), jobs[0].to(device))
        q_values_2 = dataCenter2.get_q_values(dataCenter1.representations.unsqueeze(0), jobs[1].to(device))

        action1, q_value_1 = epsilon_greedy(q_values_1, EPSILON)
        action2, q_value_2 = epsilon_greedy(q_values_2, EPSILON)
        current_episode_actions.append((action1, action2))

        # action1, q_value_1 = new_action1, new_q_value_1
        # action2, q_value_2 = new_action2, new_q_value_2

    print("we got ", total_rewards, "total reward; actions taken are", actions, 200*EPSILON)

    #print(total_rewards, l_time, r_time)

Using device: cuda:0


  x = torch.nn.functional.softmax(x)


we got  tensor(125.3600, device='cuda:0') total reward; actions taken are tensor([207., 193.]) 190.0
we got  tensor(129.6600, device='cuda:0') total reward; actions taken are tensor([225., 175.]) 180.5
we got  tensor(119.1600, device='cuda:0') total reward; actions taken are tensor([223., 177.]) 171.47499999999997
we got  tensor(98.8400, device='cuda:0') total reward; actions taken are tensor([238., 162.]) 162.90124999999998
we got  tensor(123.1400, device='cuda:0') total reward; actions taken are tensor([243., 157.]) 154.75618749999998
we got  tensor(115.6800, device='cuda:0') total reward; actions taken are tensor([246., 154.]) 147.01837812499997
we got  tensor(116.1000, device='cuda:0') total reward; actions taken are tensor([250., 150.]) 139.66745921874994
we got  tensor(129.1600, device='cuda:0') total reward; actions taken are tensor([255., 145.]) 132.68408625781245
we got  tensor(107.8000, device='cuda:0') total reward; actions taken are tensor([270., 130.]) 126.04988194492182
w

KeyboardInterrupt: 

In [512]:
print(current_episode_jobs, current_episode_actions)

[[tensor([8.0000, 0.6000]), tensor([2.0000, 0.2000])], [tensor([8.0000, 0.8000]), tensor([2.0000, 0.2000])], [tensor([2., 0.]), tensor([4.0000, 0.4000])], [tensor([2.0000, 0.2000]), tensor([4.0000, 0.4000])], [tensor([8.0000, 0.6000]), tensor([8.0000, 0.6000])], [tensor([4.0000, 0.2000]), tensor([2.0000, 0.2000])], [tensor([8.0000, 0.6000]), tensor([8.0000, 0.8000])], [tensor([8.0000, 0.6000]), tensor([2.0000, 0.2000])], [tensor([4.0000, 0.2000]), tensor([2.0000, 0.2000])], [tensor([8.0000, 0.8000]), tensor([4.0000, 0.4000])], [tensor([2., 0.]), tensor([4.0000, 0.4000])], [tensor([4.0000, 0.2000]), tensor([2.0000, 0.2000])], [tensor([4.0000, 0.4000]), tensor([8.0000, 0.8000])], [tensor([8.0000, 0.6000]), tensor([8.0000, 0.8000])], [tensor([8.0000, 0.8000]), tensor([8.0000, 0.8000])], [tensor([8.0000, 0.6000]), tensor([8.0000, 0.6000])], [tensor([2.0000, 0.2000]), tensor([8.0000, 0.6000])], [tensor([4.0000, 0.4000]), tensor([2.0000, 0.2000])], [tensor([8.0000, 0.6000]), tensor([4.0000, 

In [43]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = AdvantageMARL().to(device)
advantage = AdvantageMARL().to(device)

# load parameters
model.load_state_dict(torch.load("advantage_no_sharing_117.pth")["model_1"])
advantage.load_state_dict(torch.load("advantage_no_sharing_117.pth")["advantage_model_1"])

for N in [2,3,4,5,6,7]:
    dataCenters = [DataCenter(device) for _ in range(N)]
    jobGenerator = JobGenerator(N)
    
    total_rewards = 0
    reward_history = []

    for episode in range(2000):
        jobs = jobGenerator.generate_job()
        # advantages = [advantage.forward_pass(dataCenters[i].state.to(device)) for i in range(N)]
        actions = []
        for i in range(N):
            q_values = model.forward_pass(torch.cat((dataCenters[i].state.to(device), jobs[i].to(device))).to(device))
            value_local = q_values[0]
            value_local_max = value_local
            value_send = q_values[1]
            value_send_max = -1000, -1
            for j in range(N):
                adv_values = advantage.forward_pass(torch.cat((dataCenters[j].state.to(device), jobs[i].to(device))).to(device))
                value_local += adv_values[0]/(N-1)
                # value_local_max = max(value_local_max, value_local+ adv_values[0])
                value_send_max = max(value_send_max, (value_send+ adv_values[1],j))
            if value_send_max[0] > value_local_max:
                actions.append(value_send_max[1])
            else:
                actions.append(i)
            # actions.append(value_send_max[1])
            
        reward = 0
        for i in range(N):
            if actions[i] == i:
                reward += dataCenters[i].add_job(jobs[i])
            else:
                jobs[i][1] *= 0.8
                reward = dataCenters[actions[i]].add_job(jobs[i])
            reward += dataCenters[i].update(1)
        total_rewards += reward
        reward_history.append(reward)
        print(total_rewards, actions)
# print(reward_history, total_rewards)
print("------------")
        # print(advantages)

tensor(0.8000, device='cuda:0') [0, 1]
tensor(0.8000, device='cuda:0') [0, 0]
tensor(2.3000, device='cuda:0') [0, 1]
tensor(3.2200, device='cuda:0') [1, 1]
tensor(3.8200, device='cuda:0') [0, 1]
tensor(3.8200, device='cuda:0') [0, 0]
tensor(5.6000, device='cuda:0') [0, 1]
tensor(6.9000, device='cuda:0') [0, 1]
tensor(7.8000, device='cuda:0') [0, 1]
tensor(8.6000, device='cuda:0') [0, 1]
tensor(8.6000, device='cuda:0') [0, 0]
tensor(10., device='cuda:0') [0, 1]
tensor(10., device='cuda:0') [0, 0]
tensor(11.5000, device='cuda:0') [0, 1]
tensor(12.8000, device='cuda:0') [0, 1]
tensor(12.8000, device='cuda:0') [0, 0]
tensor(13.7600, device='cuda:0') [1, 1]
tensor(14.7300, device='cuda:0') [0, 1]
tensor(15.2300, device='cuda:0') [0, 1]
tensor(15.3900, device='cuda:0') [1, 1]
tensor(16.0700, device='cuda:0') [0, 1]
tensor(16.6700, device='cuda:0') [0, 1]
tensor(17.2700, device='cuda:0') [0, 1]
tensor(17.9100, device='cuda:0') [1, 1]
tensor(18.5100, device='cuda:0') [0, 1]
tensor(19.2100, dev