In [19]:
# Length of a data center's flat state vector: 5 machines x 2 fields followed
# by 6 queue slots x 3 fields (the layout DataCenter.update unpacks).
STATE_SIZE = 28
# Only referenced by the commented-out StateCompressor construction below.
QUERY_SIZE = 1
# Length of the per-data-center representation vector.
VALUE_SIZE = 4
# second config: no information is passed
# (this assignment overrides the 4 above, so VALUE_SIZE is effectively 1)
VALUE_SIZE = 1


# Number of values describing one job; JobGenerator emits 2-element tensors.
JOB_SIZE = 2

# import necessary libraries
import torch
import torch.nn as nn

class DataCenter():
    """Simulated data center: 5 machines fed by a 6-slot job queue.

    ``self.state`` is a flat tensor laid out as

    * ``state[:10]`` -- 5 machines x ``(busy flag, remaining time)``
    * ``state[10:]`` -- 6 queue slots x ``(occupied flag, duration, reward)``

    ``update_rep``, ``get_q_values`` and ``backprop`` use ``self.compressor``,
    ``self.dqn``, ``self.dqn_optimizer`` and ``self.compressor_optimizer``.
    ``__init__`` does not create them (their construction is disabled), so
    they must be attached to the instance before those methods are called.
    """

    def __init__(self, device):
        """Create an idle data center whose tensors live on ``device``."""
        self.state = torch.zeros(STATE_SIZE).to(device)
        self.representations = torch.zeros(VALUE_SIZE).to(device)
        self.device = device

    def update(self, delta):
        """Advance the simulation by ``delta`` time units.

        Running jobs progress, finished machines become idle, queued jobs are
        started on idle machines in queue order, the queue is compacted and
        the reward of every job still waiting decays by a factor of 0.9.

        Returns:
            0-dim tensor on ``self.device`` holding the summed reward of the
            jobs that were started during this call.
        """
        with torch.no_grad():
            # reward = gains from successful job allocation
            reward = torch.tensor(0.0).to(self.device)
            # Work on copies so self.state is only replaced at the very end.
            machines = self.state[:10].view(5, 2).clone()
            queues = self.state[10:].view(6, 3).clone()

            # Count down remaining time; a machine at zero is idle again.
            machines[:, 1] = torch.clamp(machines[:, 1] - delta, min=0)
            machines[machines[:, 1] == 0, 0] = 0

            for slot in range(queues.size(0)):
                # The queue is kept compacted, so the first empty slot marks
                # the end of the waiting jobs.
                if not queues[slot, 0] > 0:
                    break
                idle = torch.nonzero(machines[:, 0] == 0, as_tuple=False)
                if idle.size(0) == 0:
                    # No machine can become idle inside this loop, so the
                    # remaining jobs cannot be started either.
                    break
                target = idle[0].item()
                machines[target, 0] = 1
                machines[target, 1] = queues[slot, 1]
                reward += queues[slot, 2]
                queues[slot, :] = 0

            # move remaining jobs to the front
            waiting = queues[:, 0] > 0
            queues = torch.cat((queues[waiting], queues[queues[:, 0] == 0]), 0)

            # Jobs that are still queued lose 10% of their reward per update.
            queues[:, 2] *= 0.9

            # Merge the updated machine and queue states back into self.state
            self.state = torch.cat((machines.view(-1), queues.view(-1)))

            return reward

    def update_rep(self, remote_info):
        """Replace ``self.representations`` with the compressor's output.

        Requires ``self.compressor`` (not created by ``__init__``).

        Raises:
            AssertionError: if the new representation's size differs from the
                current one.
        """
        new_reps = self.compressor.forward_pass(self.state, remote_info)
        assert new_reps.size() == self.representations.size()
        self.representations = new_reps

    def get_q_values(self, reps, job):
        """Score ``job`` against the local and every remote representation.

        Row 0 of the batch is ``(state, own representation, job, 1)``; each
        following row is ``(state, reps[k], job, 0)``.  The batch holds one
        row per entry of ``reps`` (previously it was fixed at two rows, which
        failed for more than one remote representation) and is built on
        ``self.device`` (previously the ``.to(device)`` result was discarded).

        Requires ``self.dqn`` (not created by ``__init__``).  ``reps`` and
        ``job`` must be on the same device as ``self.state``.

        Returns:
            Whatever ``self.dqn.forward_pass`` returns for the batch.
        """
        local_flag = torch.ones(1).to(self.device)
        remote_flag = torch.zeros(1).to(self.device)
        rows = [torch.cat((self.state, self.representations, job, local_flag), 0)]
        rows.extend(
            torch.cat((self.state, rep, job, remote_flag), 0) for rep in reps
        )
        batch_input = torch.stack(rows).to(self.device)
        return self.dqn.forward_pass(batch_input)

    # add job to the queue of the data center
    def add_job(self, job):
        """Put ``job`` into the first free queue slot.

        Args:
            job: indexable pair; ``job[0]`` and ``job[1]`` are written to the
                slot's second and third fields.

        Returns:
            0 if the job was queued, -0.2 if all six slots were occupied (the
            job is dropped).
        """
        state = self.state.clone()
        for slot in range(6):
            base = 10 + slot * 3
            if state[base] == 0:
                state[base] = 1
                state[base + 1] = job[0]
                state[base + 2] = job[1]
                self.state = state
                return 0
        self.state = state
        return -0.2

    # how to do this?
    def backprop(self):
        """Apply and then clear the gradients of both optimizers.

        Requires ``self.dqn_optimizer`` and ``self.compressor_optimizer``
        (not created by ``__init__``).
        """
        self.dqn_optimizer.step()
        self.compressor_optimizer.step()

        self.dqn_optimizer.zero_grad()
        self.compressor_optimizer.zero_grad()


class JobGenerator():
    """Random job source with one hidden workload regime per data center.

    Each data center is in a high (1) or low (0) workload regime.  On every
    call a regime flips with probability 0.02, then one job is drawn from the
    regime's distribution.  Jobs are ``(first value, second value)`` pairs;
    DataCenter stores them as a queue slot's duration and reward.
    """

    # Chance per call that a data center switches workload regime.
    _FLIP_PROBABILITY = 0.02
    # (exclusive upper bound on the uniform draw, job) in ascending order.
    _HIGH_WORKLOAD = (
        (0.4, (10, 1.0)),
        (0.7, (6, 0.6)),
        (float("inf"), (4, 0.4)),
    )
    _LOW_WORKLOAD = (
        (0.3, (4, 0.4)),
        (0.7, (3, 0.3)),
        (float("inf"), (2, 0.2)),
    )

    def __init__(self, data_center_num) -> None:
        """Start every one of ``data_center_num`` centers in a random regime."""
        self.underlying_state = torch.randint(0, 2, (data_center_num,))
        self.data_center_num = data_center_num

    def generate_job(self):
        """Draw one job per data center.

        Returns:
            List of ``data_center_num`` 2-element tensors.
        """
        jobs = []
        for i in range(self.data_center_num):
            if torch.rand(1).item() < self._FLIP_PROBABILITY:
                # Toggle between 0 and 1.  The previous `1 - i` wrote values
                # derived from the loop index (e.g. -1, -2) instead of
                # flipping the regime.
                self.underlying_state[i] = 1 - self.underlying_state[i]

            seed = torch.rand(1).item()
            if self.underlying_state[i] == 1:
                table = self._HIGH_WORKLOAD
            else:
                table = self._LOW_WORKLOAD
            jobs.append(next(job for bound, job in table if seed < bound))
        return [torch.tensor(j) for j in jobs]


def epsilon_greedy(q_values, epsilon):
    """Pick an action index from ``q_values`` with epsilon-greedy exploration.

    With probability ``epsilon`` a uniformly random index is drawn (returned
    as a tensor of shape ``(1,)``); otherwise the argmax index is used
    (returned as a 0-dim tensor).

    Returns:
        ``(action, q_value)`` where ``q_value`` is ``q_values[action]`` and
        therefore has the same number of dimensions as ``action``.
    """
    explore = torch.rand(1).item() < epsilon
    if explore:
        chosen = torch.randint(0, q_values.size(0), (1, ))
    else:
        chosen = torch.argmax(q_values)
    return chosen, q_values[chosen]


import random

class ReplayBuffer:
    """Fixed-capacity ring buffer of 8-field transitions."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        # Index the next push writes to; wraps around at `capacity`.
        self.position = 0

    def push(self, state, rep, job, action, reward, next_state, next_rep, next_job):
        """Store one transition, overwriting the oldest once the buffer is full."""
        transition = (state, rep, job, action, reward, next_state, next_rep, next_job)
        if len(self.buffer) < self.capacity:
            # While filling up, `position` always equals len(buffer).
            self.buffer.append(transition)
        else:
            self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Draw up to ``batch_size`` distinct transitions.

        Returns:
            Tuple of 8 tensors, each the per-field values stacked along a new
            leading batch dimension.
        """
        count = min(batch_size, len(self.buffer))
        picked = random.sample(self.buffer, count)
        state, rep, job, action, reward, next_state, next_rep, next_job = zip(*picked)
        columns = (state, rep, job, action, reward, next_state, next_rep, next_job)
        return tuple(torch.stack(list(column)) for column in columns)

    def __len__(self):
        return len(self.buffer)

# Replay mini-batch size; the later training cell rebinds BATCH_SIZE to 32.
BATCH_SIZE = 256

In [None]:

# define IL DQN model
import torch
import torch.nn as nn

class AdvantageMARL(nn.Module):
    """Five-layer fully connected network with leaky-ReLU activations.

    Args:
        input_size: width of the input vector (default 30).
        output_size: number of output values (default 2).

    The layers are stored as ``layer1`` .. ``layer5`` so that state dicts keep
    the keys ``layer1.weight`` .. ``layer5.bias``.
    """

    def __init__(self, input_size=30, output_size=2):
        super(AdvantageMARL, self).__init__()
        self.layer1 = nn.Linear(input_size, 256)
        self.layer2 = nn.Linear(256, 128)
        self.layer3 = nn.Linear(128, 128)
        self.layer4 = nn.Linear(128, 128)
        self.layer5 = nn.Linear(128, output_size)

    def __str__(self):
        # The previous text referenced attributes that were never defined
        # (input_layer, hidden_layer_1, ...), so str()/repr() raised
        # AttributeError; describe the layers that actually exist.
        return (
            f'Neural Network with input layer {self.layer1}, '
            f'hidden layer 1 {self.layer2}, '
            f'hidden layer 2 {self.layer3}, '
            f'hidden layer 3 {self.layer4}, '
            f'and output layer {self.layer5}'
        )

    def __repr__(self):
        return self.__str__()

    def forward_pass(self, input_data):
        """Return the network output for ``input_data`` (last dim = input size)."""
        x = input_data
        for hidden in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = torch.nn.functional.leaky_relu(hidden(x))
        # No activation on the output layer.
        return self.layer5(x)

    def forward(self, input_data):
        """Delegate to ``forward_pass`` so the module can be called directly."""
        return self.forward_pass(input_data)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

models = [AdvantageMARL().to(device) for _ in range(2)]
advantage = [AdvantageMARL().to(device) for _ in range(2)]

# load parameters
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU still loads on a CPU-only host.
model_data = torch.load("advantage_no_sharing_double_q.pth", map_location=device)
models[0].load_state_dict(model_data["model_1"])
models[1].load_state_dict(model_data["model_2"])
advantage[0].load_state_dict(model_data["advantage_model_1"])
advantage[1].load_state_dict(model_data["advantage_model_2"])

N = 2
# p is the probability that a cached network output is recomputed on a given
# step; with probability 1 - p the stale cached value is reused.
for p in [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]:
    # `trial` instead of `iter`, which shadowed the builtin.
    for trial in range(10):
        print(p, trial)
        dataCenters = [DataCenter(device) for _ in range(N)]
        jobGenerator = JobGenerator(N)

        total_rewards = 0
        reward_history = []

        # Caches of the most recent Q / advantage outputs.
        q_look_up = {}
        adv_look_up = {}

        for episode in range(2000):
            jobs = jobGenerator.generate_job()
            actions = []
            # Pure inference: no autograd graph is needed.
            with torch.no_grad():
                for i in range(N):
                    if i not in q_look_up or random.random() < p:
                        q_look_up[i] = models[i].forward_pass(torch.cat((dataCenters[i].state.to(device), jobs[i].to(device))).to(device))
                    q_values = q_look_up[i]
                    # Clone before accumulating: q_values[0] is a view of the
                    # cached tensor, and the previous in-place `+=` leaked the
                    # advantage terms into the cache on every reuse.
                    value_local = q_values[0].clone()
                    value_send = q_values[1]
                    best_send_value, best_send_target = float("-inf"), -1
                    for j in range(N):
                        if i == j:
                            continue
                        if (i, j) not in adv_look_up or random.random() < p:
                            adv_look_up[(i, j)] = advantage[j].forward_pass(torch.cat((dataCenters[j].state.to(device), jobs[i].to(device))).to(device))
                        adv_values = adv_look_up[(i, j)]
                        value_local = value_local + adv_values[0] / (N - 1)
                        candidate = (value_send + adv_values[1]).item()
                        # `>=` keeps the later target on exact ties.
                        if candidate >= best_send_value:
                            best_send_value, best_send_target = candidate, j
                    if best_send_value > value_local.item():
                        actions.append(best_send_target)
                    else:
                        actions.append(i)

            reward = 0
            # NOTE(review): add_job and update are interleaved per data
            # center here, so a center may be updated before it receives a
            # job forwarded by a later center in the same step -- confirm
            # this ordering is intended.
            for i in range(N):
                if actions[i] == i:
                    reward += dataCenters[i].add_job(jobs[i])
                else:
                    # Forwarded jobs lose 20% of their reward.
                    jobs[i][1] *= 0.8
                    reward += dataCenters[actions[i]].add_job(jobs[i])
                reward += dataCenters[i].update(1)
            total_rewards += reward
            reward_history.append(reward)
        print(total_rewards, actions)
        print("------------")

In [27]:
# Number of transitions requested from each replay buffer per training step.
BATCH_SIZE = 32


# define IL DQN model
import torch
import torch.nn as nn

class AdvantageMARL(nn.Module):
    """Five-layer fully connected network with leaky-ReLU activations.

    Args:
        input_size: width of the input vector (default 30).
        output_size: number of output values (default 2).

    The layers are stored as ``layer1`` .. ``layer5`` so that state dicts keep
    the keys ``layer1.weight`` .. ``layer5.bias``.
    """

    def __init__(self, input_size=30, output_size=2):
        super(AdvantageMARL, self).__init__()
        self.layer1 = nn.Linear(input_size, 256)
        self.layer2 = nn.Linear(256, 128)
        self.layer3 = nn.Linear(128, 128)
        self.layer4 = nn.Linear(128, 128)
        self.layer5 = nn.Linear(128, output_size)

    def __str__(self):
        # The previous text referenced attributes that were never defined
        # (input_layer, hidden_layer_1, ...), so str()/repr() raised
        # AttributeError; describe the layers that actually exist.
        return (
            f'Neural Network with input layer {self.layer1}, '
            f'hidden layer 1 {self.layer2}, '
            f'hidden layer 2 {self.layer3}, '
            f'hidden layer 3 {self.layer4}, '
            f'and output layer {self.layer5}'
        )

    def __repr__(self):
        return self.__str__()

    def forward_pass(self, input_data):
        """Return the network output for ``input_data`` (last dim = input size)."""
        x = input_data
        for hidden in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = torch.nn.functional.leaky_relu(hidden(x))
        # No activation on the output layer.
        return self.layer5(x)

    def forward(self, input_data):
        """Delegate to ``forward_pass`` so the module can be called directly."""
        return self.forward_pass(input_data)

class ReplayBuffer:
    """Fixed-capacity ring buffer whose entries are sequences of tensors."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        # Index the next push writes to; wraps around at `capacity`.
        self.position = 0

    def push(self, kwargs):
        """Store one entry (a sequence of tensors), overwriting the oldest when full."""
        if len(self.buffer) < self.capacity:
            # While filling up, `position` always equals len(buffer).
            self.buffer.append(kwargs)
        else:
            self.buffer[self.position] = kwargs
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Draw entries with replacement and stack them field by field.

        Up to ``batch_size`` entries are drawn (never more than are stored).

        Returns:
            List with one stacked tensor per field of the stored entries.
        """
        count = min(batch_size, len(self.buffer))
        drawn = random.choices(self.buffer, k=count)
        stacked = []
        for field_values in zip(*drawn):
            stacked.append(torch.stack(field_values))
        return stacked

    def __len__(self):
        return len(self.buffer)

# Main Simulation

# configs
# Anomaly detection makes autograd errors point at the offending forward op.
torch.autograd.set_detect_anomaly(True)
# Initial exploration rate; multiplied by 0.95 at the start of every episode.
EPSILON = 0.5

# Check if GPU is available and if so, use it
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
print(f"Using device: {device}")

# Manual toggle: the True branch (re)builds everything from the checkpoint;
# switch to False to keep the objects already in memory and only reset EPSILON.
if True:
    # Initialize Data Centers, Models, and Optimizers
    dataCenters = [DataCenter(device) for _ in range(4)]
    models = [AdvantageMARL().to(device) for _ in range(4)]
    # for double-DQN
    model_targets = [AdvantageMARL().to(device) for _ in range(4)]
    advantage_models = [AdvantageMARL().to(device) for _ in range(4)]
    optimizers = [torch.optim.Adam(model.parameters(), lr=0.001) for model in models]
    advantage_optimizers = [torch.optim.Adam(advantage_model.parameters(), lr=0.001) for advantage_model in advantage_models]

    # load models
    # map_location remaps the stored tensors onto `device`, so a checkpoint
    # saved on a GPU still loads on a CPU-only host.
    dic = torch.load("advantage_no_sharing_double_q_200_episodes.pth", map_location=device)
    print(dic.keys())
    for i in range(4):
        # The checkpoint holds two agents; agents 0/2 start from entry 1 and
        # agents 1/3 from entry 2.
        k = i % 2 + 1
        models[i].load_state_dict(dic[f"model_{k}"])
        model_targets[i].load_state_dict(dic[f"model_{k}"])
        advantage_models[i].load_state_dict(dic[f"advantage_model_{k}"])
        optimizers[i].load_state_dict(dic[f"optimizer_{k}"])
        advantage_optimizers[i].load_state_dict(dic[f"optimizer_advantage_{k}"])
    dummy_value = torch.zeros(1).to(device)
else:
    EPSILON = 0.5*0.99**2

# One replay buffer per agent.
replay_buffers = [ReplayBuffer(50000) for _ in range(4)]



# Hyperparameters
M = 200  # Number of episodes
N = 2000  # Number of timesteps per episode

for episode in range(M):
    print(f"Episode {episode}")
    EPSILON *= 0.95
    # keep track of rewards
    total_rewards = 0
    # NOTE(review): never updated below, so the end-of-episode print always
    # shows zeros -- confirm what should be counted here.
    actions_record = torch.zeros((4,))

    # Reset states
    for dataCenter in dataCenters:
        dataCenter.state = torch.zeros(STATE_SIZE).to(device)

    jobGenerator = JobGenerator(4)

    jobs = jobGenerator.generate_job()

    actions = [i for i in range(4)]
    agents_q_values = [None for _ in range(4)]

    for timestep in range(N):
        if timestep % 10 == 0:
            print(f"timestep {timestep}", actions)
        # Snapshot the jobs exactly as the agents observe them.  The replay
        # buffer must store these: `jobs` is later discounted in place for
        # forwarded jobs and then replaced by the next step's jobs, which is
        # what the buffer used to record.
        decision_jobs = [job.clone() for job in jobs]
        current_states = [torch.cat((dataCenter.state, jobs[i].to(device)), 0) for i, dataCenter in enumerate(dataCenters)]

        # ############################## action selection ##############################
        for source in range(4):
            # Every data center's advantage network scores the source's job.
            advantage_states = [torch.cat((dataCenter.state, jobs[source].to(device)), 0) for dataCenter in dataCenters]
            advantage_values = [advantage_model.forward_pass(advantage_states[i]) for i, advantage_model in enumerate(advantage_models)]

            q_value = models[source].forward_pass(current_states[source])
            # Entry `source` = keep the job locally, any other entry = send it there.
            q_values = torch.tensor([q_value[0] if i == source else q_value[1] for i in range(4)]).to(device)

            for i, advantage_value in enumerate(advantage_values):
                if i == source:
                    continue
                q_values[i] += advantage_value[1]/3
                q_values[source] += advantage_value[0]
            action, q_value = epsilon_greedy(q_values, EPSILON)
            actions[source] = action.to(device).reshape((1,))
            agents_q_values[source] = q_value

        rewards = torch.zeros((4,)).to(device)

        for i in range(4):
            if actions[i] == i:
                rewards[i] += dataCenters[i].add_job(jobs[i])
            else:
                # Forwarded jobs lose 20% of their reward.
                jobs[i][1] *= 0.8
                rewards[i] += dataCenters[actions[i]].add_job(jobs[i])

        for i in range(4):
            rewards[i] += dataCenters[i].update(1)

        jobs = jobGenerator.generate_job()

        next_states = [torch.cat((dataCenter.state, jobs[i].to(device)), 0) for i, dataCenter in enumerate(dataCenters)]

        total_rewards += torch.sum(rewards)

        # ############################## update replay buffer ##############################
        # Entry layout: [state, job_0..job_3, action_0..action_3, reward, next_state]
        for i in range(4):
            replay_buffers[i].push([current_states[i][:STATE_SIZE]] + decision_jobs + actions + [rewards[i], next_states[i]])

        for i in range(4):
            replay_data = replay_buffers[i].sample(BATCH_SIZE)
            replay_state = replay_data[0]
            replay_jobs = replay_data[1:5]
            replay_actions = replay_data[5:9]
            replay_reward = replay_data[9]
            replay_next_state = replay_data[10]

            # ############################## update Q-value model ##############################
            replay_q_values = models[i].forward_pass(torch.concat((replay_state.detach(), replay_jobs[i].detach().to(device)), 1))

            # Column 0 = kept locally, column 1 = sent elsewhere.
            replay_actual_q_values = replay_q_values[torch.arange(replay_state.size(0)), (replay_actions[i].view(-1)!=i).int()]

            # The bootstrap target comes from the frozen target network and
            # must not carry gradients.
            with torch.no_grad():
                replay_next_q_values = model_targets[i].forward_pass(replay_next_state)
                replay_expected_values = replay_reward + 0.95 * torch.max(replay_next_q_values, 1)[0]

            loss = torch.nn.MSELoss()(replay_actual_q_values, replay_expected_values)
            # Nothing backpropagates through this graph again, so it does not
            # need to be retained.
            loss.backward()
            optimizers[i].step()
            optimizers[i].zero_grad()

            # ############################## update Advantage model ##############################
            # NOTE(review): the accumulator starts at one rather than zero,
            # which adds a constant offset to the summed advantages -- confirm.
            replay_actual_advantage_values = torch.ones((replay_state.size(0),)).to(device)
            for j in range(4):
                if i == j:
                    continue
                # 1 where agent j sent its job to data center i, else 0.
                replay_action_0_1 = (replay_actions[j] == i).int().squeeze(1)
                replay_tempt_advantages = advantage_models[j].forward_pass(torch.cat((replay_state.detach(), replay_jobs[j].detach().to(device)), 1))
                replay_tempt_advantages[:, 0] *= 3
                replay_temp_part_advantage = replay_tempt_advantages[torch.arange(replay_state.size(0)), replay_action_0_1]
                replay_actual_advantage_values += replay_temp_part_advantage

            replay_expected_advantage_values = replay_expected_values - replay_actual_q_values

            loss_advantage = torch.nn.MSELoss()(replay_actual_advantage_values, replay_expected_advantage_values.detach())

            loss_advantage.backward()
            # NOTE(review): this loss produces gradients for advantage_models[j]
            # with j != i, while the optimizer stepped here belongs to
            # advantage_models[i]; each model is therefore updated with
            # gradients accumulated from the other agents' losses -- confirm
            # this is intended.
            advantage_optimizers[i].step()
            advantage_optimizers[i].zero_grad()

        print(replay_expected_advantage_values)
        # update target network
        if timestep % 100 == 0:
            for model, model_target in zip(models, model_targets):
                model_target.load_state_dict(model.state_dict())
        print(actions)
        print(rewards)
        print(agents_q_values)
        for state in current_states:
            print(state)

        print("------------")

    print("we got ", total_rewards, "total reward")
    print("actions", actions_record, EPSILON * 2000)


Using device: cuda:0
dict_keys(['model_1', 'model_2', 'advantage_model_1', 'advantage_model_2', 'optimizer_1', 'optimizer_2', 'optimizer_advantage_1', 'optimizer_advantage_2'])
Episode 0
timestep 0 [0, 1, 2, 3]
tensor([-0.2346], device='cuda:0', grad_fn=<SubBackward0>)
[tensor([1], device='cuda:0'), tensor([0], device='cuda:0'), tensor([2], device='cuda:0'), tensor([1], device='cuda:0')]
tensor([0.8000, 0.4800, 1.0000, 0.0000], device='cuda:0')
[tensor(8.5585, device='cuda:0', grad_fn=<SelectBackward0>), tensor([8.7787], device='cuda:0', grad_fn=<IndexBackward0>), tensor([10.4155], device='cuda:0', grad_fn=<IndexBackward0>), tensor(8.2352, device='cuda:0', grad_fn=<SelectBackward0>)]
tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 3.0000, 0.3000], device='cuda:0')
tensor([ 0.,  0.,