In [1]:
import copy
from collections import deque

import numpy as np
import torch

In [2]:
class DQN:
    """Prediction/target pair of small fully connected Q-networks.

    ``self.model`` is the prediction network that is trained with Adam;
    ``self.model2`` is the target network, a copy that only changes when
    ``update_target`` is called.
    """

    def __init__(self, state_size, action_size=4, hidden_size=24,
                 learning_rate=0.001):
        """Build both networks, the MSE loss and the optimiser.

        Args:
            state_size: Number of input features per state.
            action_size: Number of discrete actions (network outputs).
            hidden_size: Width of the two hidden layers.
            learning_rate: Step size handed to Adam.
        """
        self.model = torch.nn.Sequential(
            torch.nn.Linear(state_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, action_size),
        )
        # deepcopy already duplicates the weights, so no extra
        # load_state_dict is required here.
        self.model2 = copy.deepcopy(self.model)
        self.loss_fn = torch.nn.MSELoss()
        self.learning_rate = learning_rate
        self.optimizer = torch.optim.Adam(
            self.model.parameters(), lr=self.learning_rate
        )

    def update_target(self):
        """Copy the prediction network's weights into the target network.

        Call this at regular intervals during training.
        """
        self.model2.load_state_dict(self.model.state_dict())

    def get_qvals(self, state):
        """Return the prediction network's Q-values for ``state``.

        Args:
            state: A single state (numpy array, sequence or tensor).

        Returns:
            A torch tensor with a leading batch dimension of 1 added in
            front of the network output (not a numpy array).
        """
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        return self.model(state)

    def get_maxQ(self, state):
        """Return the largest Q-value for ``state`` from the target network.

        Args:
            state: A single state (numpy array, sequence or tensor).

        Returns:
            The maximum Q-value as a Python float.
        """
        state = torch.as_tensor(state, dtype=torch.float32)
        # TD targets must not back-propagate into any network.
        with torch.no_grad():
            q_values = self.model2(state)
        return torch.max(q_values).item()

    def train_one_step(self, states, actions, targets):
        """Perform one gradient step on the prediction network.

        Args:
            states: Minibatch of states; each entry is either a flat
                feature vector or an array that already carries a leading
                batch dimension.
            actions: Minibatch of the actions taken, parallel to ``states``.
            targets: Minibatch of TD targets, parallel to ``states``.

        Returns:
            The loss of this step as a float (useful for debugging).
        """
        rows = []
        for state in states:
            row = torch.as_tensor(state, dtype=torch.float32)
            # A flat state becomes one row so that cat() builds a
            # (batch, features) matrix instead of one long vector.
            rows.append(row if row.dim() > 1 else row.reshape(1, -1))
        state_batch = torch.cat(rows)
        action_batch = torch.as_tensor(actions, dtype=torch.long).reshape(-1, 1)
        target_batch = torch.as_tensor(targets, dtype=torch.float32).reshape(-1)

        q_all = self.model(state_batch)
        # squeeze(1) keeps the batch dimension even for a batch of one.
        predicted = q_all.gather(dim=1, index=action_batch).squeeze(1)
        loss = self.loss_fn(predicted, target_batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

In [4]:
class Agent:
    """Grid-world agent that picks actions epsilon-greedily from a DQN.

    Attributes:
        position: Current ``(row, col)`` cell of the agent.
        grid_size: ``(rows, cols)`` of the grid used to clamp moves.
        picked: Whether the agent currently carries a package.
        reward: Reward accumulated so far.
        done: Whether the agent has finished its task.
        epsilon: Probability of choosing a random action.
        dqn: The agent's Q-network wrapper, or ``None`` until the size of
            the state vector is known.
        buff: Bounded replay buffer of stored transitions.
    """

    # Action index -> (row displacement, column displacement).
    _MOVES = {
        0: (-1, 0),  # up
        1: (1, 0),   # down
        2: (0, -1),  # left
        3: (0, 1),   # right
    }

    def __init__(self, start_position, epsilon, grid_size=(5, 5),
                 state_size=None, buffer_size=10000):
        """Create an agent standing on ``start_position``.

        Args:
            start_position: Initial ``(row, col)`` cell.
            epsilon: Exploration probability in ``[0, 1]``.
            grid_size: ``(rows, cols)`` of the grid.
            state_size: Length of the flattened state vector. When omitted
                the network is created lazily from the first state that
                requires a greedy decision.
            buffer_size: Maximum number of transitions kept for replay.
        """
        self.position = start_position
        self.grid_size = grid_size
        self.action_space = len(self._MOVES)
        self.picked = False
        self.reward = 0
        self.done = False
        self.epsilon = epsilon
        self.buff = deque(maxlen=buffer_size)
        self.dqn = None if state_size is None else self.initializeDQN(state_size)

    @property
    def start_position(self):
        """Alias of ``position`` kept for code that uses the older name."""
        return self.position

    @start_position.setter
    def start_position(self, value):
        self.position = value

    def initializeDQN(self, state_size):
        """Return a new DQN for ``state_size`` inputs and this agent's actions."""
        return DQN(state_size, self.action_space)

    @staticmethod
    def _encode_state(state):
        """Flatten a (possibly nested) state into a 1-D float32 array."""
        if isinstance(state, np.ndarray):
            return state.astype(np.float32).ravel()
        flat = []
        pending = [state]
        while pending:
            item = pending.pop()
            if isinstance(item, (list, tuple)):
                # Reversed so that popping from the end keeps the order.
                pending.extend(reversed(item))
            else:
                flat.append(float(item))
        return np.asarray(flat, dtype=np.float32)

    def action(self, state):
        """Choose an action index for ``state`` (epsilon-greedy).

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value.
        """
        if np.random.random() > self.epsilon:
            features = self._encode_state(state)
            if self.dqn is None:
                self.dqn = self.initializeDQN(int(features.size))
            action_values = self.dqn.get_qvals(features)
            return int(np.argmax(action_values.cpu().data.numpy()))
        return int(np.random.randint(self.action_space))

    def next_position(self, action):
        """Return the cell reached by ``action``, clamped to the grid.

        The agent itself is not moved; use ``move`` for that.
        """
        row_step, col_step = self._MOVES[action]
        rows, cols = self.grid_size
        row = min(max(self.position[0] + row_step, 0), rows - 1)
        col = min(max(self.position[1] + col_step, 0), cols - 1)
        return (row, col)

    def move(self, action):
        """Execute ``action`` by updating the agent's position."""
        self.position = self.next_position(action)

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.buff.append((
            self._encode_state(state),
            int(action),
            float(reward),
            self._encode_state(next_state),
            bool(done),
        ))

    def learn(self, batch_size=32, gamma=0.9):
        """Train the DQN on a random minibatch from the replay buffer.

        Args:
            batch_size: Number of transitions to sample.
            gamma: Discount factor used for the TD targets.

        Returns:
            The training loss, or ``None`` when no network exists yet or
            the buffer holds fewer than ``batch_size`` transitions.
        """
        if self.dqn is None or len(self.buff) < batch_size:
            return None
        chosen = np.random.choice(len(self.buff), size=batch_size, replace=False)
        batch = [self.buff[int(i)] for i in chosen]
        # Each state is passed as one row so the batch concatenates into a
        # (batch, features) matrix.
        states = [s.reshape(1, -1) for s, _, _, _, _ in batch]
        actions = [a for _, a, _, _, _ in batch]
        targets = [
            r if finished
            else r + gamma * self.dqn.get_maxQ(torch.from_numpy(nxt))
            for _, _, r, nxt, finished in batch
        ]
        return self.dqn.train_one_step(states, actions, targets)


In [7]:
import numpy.random as npr
class MultiAgentGridWorld:
    """Square grid world with pick-up agents and delivery agents.

    Pick-up agents collect a package at ``pick_up_position`` and hand it to
    a delivery agent by moving onto the same cell; delivery agents finish
    by carrying the package to ``delivery_position``.
    """

    ############################################################################################################################
    def __init__(self, size=5, agents_each=2, eps=0.01):
        """Create the world and spawn ``agents_each`` agents of each type.

        Args:
            size: Side length of the square grid.
            agents_each: Number of pick-up agents and of delivery agents.
            eps: Exploration rate handed to every agent.
        """
        self.size = size
        # Cells are (row, col) and render() prints row 0 first, so the
        # pick-up cell is the top-right corner and the delivery cell the
        # bottom-left corner.
        self.pick_up_position = (0, size - 1)
        self.delivery_position = (size - 1, 0)
        self.agents_each = agents_each
        self.eps = eps
        self.pick_reward = 10
        self.drop_reward = 10
        self.handover_reward = 20
        self.steps = 0
        self.done = False
        self.memory = []   # (state, actions, rewards, next_state, done) per step
        self.actions = []  # (pick actions, drop actions) per step
        self.pick_agents = []
        self.drop_agents = []
        # All agents: pick-up agents first, then delivery agents.
        self.agents = self.initialize_agents()

    def _sample_start_positions(self):
        """Draw distinct start cells that avoid the pick-up and delivery cells.

        Raises:
            ValueError: If the grid has too few free cells for all agents.
        """
        blocked = (self.pick_up_position, self.delivery_position)
        possible_positions = [
            (i, j)
            for i in range(self.size)
            for j in range(self.size)
            if (i, j) not in blocked
        ]
        needed = 2 * self.agents_each
        if needed > len(possible_positions):
            raise ValueError(
                f"grid of size {self.size} cannot hold {needed} agents"
            )
        indexes = npr.choice(len(possible_positions), needed, replace=False)
        return [possible_positions[k] for k in indexes]

    def _spawn_agent(self, position):
        """Create one agent on ``position`` that knows the grid dimensions."""
        agent = Agent(position, self.eps)
        # Agents clamp their moves against grid_size.
        agent.grid_size = (self.size, self.size)
        return agent

    def initialize_agents(self):
        """Create fresh agents on random distinct cells and return them all."""
        starts = self._sample_start_positions()
        self.pick_agents = [
            self._spawn_agent(pos) for pos in starts[:self.agents_each]
        ]
        self.drop_agents = [
            self._spawn_agent(pos) for pos in starts[self.agents_each:]
        ]
        self.agents = self.pick_agents + self.drop_agents
        return self.agents

    def _snapshot(self):
        """Return the state as six lists: positions, picked and done flags.

        Each quantity is listed for the pick-up agents first and then for
        the delivery agents.
        """
        return [
            [agent.position for agent in self.pick_agents],
            [agent.position for agent in self.drop_agents],
            [agent.picked for agent in self.pick_agents],
            [agent.picked for agent in self.drop_agents],
            [agent.done for agent in self.pick_agents],
            [agent.done for agent in self.drop_agents],
        ]

    ############################################################################################################################
    def step(self, actions=None):
        """Advance the world by one time step.

        Args:
            actions: Optional ``[pick_actions, drop_actions]`` with one
                action per agent. When omitted every agent chooses its own
                action from the current state.

        Returns:
            ``(next_state, rewards, done)`` where ``next_state`` has the
            layout of ``_snapshot`` and ``rewards`` is
            ``[pick_rewards, drop_rewards]`` (cumulative per agent).

        Raises:
            ValueError: If ``actions`` does not hold one action per agent.
        """
        state = self._snapshot()
        self.steps += 1
        penalty = self.steps ** 2  # later successes are worth less

        if actions is None:
            # Sample each action exactly once so that the recorded action
            # is the one that is executed.
            actions_p = [agent.action(state) for agent in self.pick_agents]
            actions_d = [agent.action(state) for agent in self.drop_agents]
        else:
            actions_p, actions_d = (list(group) for group in actions)
            if (len(actions_p) != len(self.pick_agents)
                    or len(actions_d) != len(self.drop_agents)):
                raise ValueError("actions must hold one action per agent")

        next_positions_p = [
            agent.next_position(act)
            for agent, act in zip(self.pick_agents, actions_p)
        ]
        next_positions_d = [
            agent.next_position(act)
            for agent, act in zip(self.drop_agents, actions_d)
        ]
        self.actions.append((actions_p, actions_d))

        # Delivery agents first, so that a handover found below can
        # overwrite the receiving agent's carried-over values.
        picked_d, done_d, rewards_d = [], [], []
        for agent, next_pos in zip(self.drop_agents, next_positions_d):
            delivers = (not agent.done and agent.picked
                        and next_pos == self.delivery_position)
            if delivers:
                picked_d.append(False)
                done_d.append(True)
                rewards_d.append(agent.reward + self.drop_reward - penalty)
            else:
                picked_d.append(agent.picked)
                done_d.append(agent.done)
                rewards_d.append(agent.reward)

        picked_p, done_p, rewards_p = [], [], []
        for agent, next_pos in zip(self.pick_agents, next_positions_p):
            picked, done, reward = agent.picked, agent.done, agent.reward
            if not agent.done:
                if next_pos == self.pick_up_position and not agent.picked:
                    picked = True
                    reward = agent.reward + self.pick_reward - penalty
                elif agent.picked:
                    for j, receiver in enumerate(self.drop_agents):
                        free = (not receiver.picked and not receiver.done
                                and not picked_d[j])
                        if free and next_positions_d[j] == next_pos:
                            picked, done = False, True
                            picked_d[j] = True
                            reward = (agent.reward + self.handover_reward
                                      - penalty)
                            rewards_d[j] = (receiver.reward
                                            + self.handover_reward - penalty)
                            break  # a package is handed over only once
            picked_p.append(picked)
            done_p.append(done)
            rewards_p.append(reward)

        # Apply the transition so that the next step starts from it.
        for agent, pos, picked, done, reward in zip(
                self.pick_agents, next_positions_p,
                picked_p, done_p, rewards_p):
            agent.position = pos
            agent.picked = picked
            agent.done = done
            agent.reward = reward
        for agent, pos, picked, done, reward in zip(
                self.drop_agents, next_positions_d,
                picked_d, done_d, rewards_d):
            agent.position = pos
            agent.picked = picked
            agent.done = done
            agent.reward = reward

        actions = [actions_p, actions_d]
        next_states = [next_positions_p, next_positions_d,
                       picked_p, picked_d, done_p, done_d]
        rewards = [rewards_p, rewards_d]
        if all(done_p) and all(done_d):
            self.done = True
        self.memory.append((state, actions, rewards, next_states, self.done))
        return next_states, rewards, self.done

    ############################################################################################################################
    def state_of_the_world(self):
        """Return all agent positions and whether any agent holds a package."""
        positions = [agent.position for agent in self.agents]
        has_package = any(agent.picked for agent in self.agents)
        return positions, has_package

    ############################################################################################################################
    def render(self):
        """
        Renders grid world.

        "P" is the pick-up cell, "D" the delivery cell, "1" a pick-up
        agent, "2" a delivery agent and "." an empty cell.
        """
        for i in range(self.size):
            for j in range(self.size):
                if (i, j) == self.pick_up_position:
                    print("P", end=" ")
                elif (i, j) == self.delivery_position:
                    print("D", end=" ")
                elif any(agent.position == (i, j) for agent in self.pick_agents):
                    print("1", end=" ")  # type 1 agent: pick-up
                elif any(agent.position == (i, j) for agent in self.drop_agents):
                    print('2', end=" ")  # type 2 agent: delivery
                else:
                    print(".", end=" ")
            print()

    ############################################################################################################################
    def reset(self):
        """Start a new episode and return ``state_of_the_world()``.

        The existing agents are kept and only moved to new random cells
        with their flags and rewards cleared. ``memory`` and ``actions``
        are not cleared.
        """
        for agent, start in zip(self.agents, self._sample_start_positions()):
            agent.position = start
            agent.picked = False
            agent.done = False
            agent.reward = 0
        self.steps = 0
        self.done = False
        return self.state_of_the_world()

    ############################################################################################################################
    ############################################################################################################################
