# A project in Multi-agent Systems: Stage 1

## Reward Structure

Our reward structure comprises the following. The agent gets

- +5 points for reaching location A.
- +20 points for reaching location B when carrying the item 
    (this is possible since we can track if the agent has picked up the item or not)
- -1 point for each step taken.
- 0 points for standing still.


## Imports

In [15]:
import numpy as np

## Environment Setup

In [169]:
class GridWorld:
    def __init__(self, grid_size=5, target_position=None):
        self.grid_size = grid_size
        if target_position is None:
            target_position = (np.random.randint(0, self.grid_size - 1), np.random.randint(0, self.grid_size - 1))
        elif target_position[0] >= grid_size or target_position[1] >= grid_size:
                raise ValueError(f"Target position must be within the grid size of {grid_size-1}x{grid_size-1}")
        self.target_position = target_position
        self.reset()

    def reset(self):
        self.reward = 0
        while True:
            self.agent_position = (1,0)
            #self.agent_position = (np.random.randint(0, self.grid_size), np.random.randint(0, self.grid_size))
            self.item_position = (2,0)
            #self.item_position = (np.random.randint(0, self.grid_size), np.random.randint(0, self.grid_size))
            if (self.agent_position != self.item_position and 
                self.agent_position != self.target_position and 
                self.item_position != self.target_position):
                break
        self.carrying_item = False
        self.done = False
        return self._get_state()
    
    def _get_state(self):
        return (self.agent_position, self.item_position, self.carrying_item, self.reward, self.done)

    def step(self, action):
        x, y = self.agent_position
        # we represent x as the vertical position (rows). y is the horizontal position (columns).
        # we wont let the agent move outside the grid
        if action == 0: # north
            pos = (max(x-1, 0), y)
        elif action == 1: # south
            pos = (min(x+1, self.grid_size - 1), y)
        elif action == 2: # west
            pos = (x, max(y-1, 0))
        elif action == 3:
            pos = (x, min(y+1, self.grid_size - 1))
        else:
            print(action)
            raise ValueError("The action was invalid. Choose a number between 0 to 3")
        
        self.agent_position = pos

        # took a step, so reward - 1
        self.reward -=1

        # Checking if we picked up the item        
        if self.agent_position == self.item_position and not self.carrying_item:
            self.reward += 5  # +5 points for reaching location A
            self.carrying_item = True

        # Checking if we're done
        if self.agent_position == self.target_position and self.carrying_item:
            self.reward += 10 # +10 points for reaching location B when carrying the item
            self.done = True


        return self._get_state(), self.reward, self.done

In [170]:
env = GridWorld(grid_size=5, target_position = (3,0))
state = env.reset()

 #test here. you can change parameters in the reset function for proper testing
actions = [1, 1, 1, 1, 1, 1, 1] # see the step function for the mapping of the movements
for action in actions:
    state, reward, done = env.step(action)
    print(f"agent pos: {state[0]}, item_pos: {state[1]}, carrying_item: {state[2]}, reward: {state[3]}, done: {state[4]}")
    if done:
        break

agent pos: (2, 0), item_pos: (2, 0), carrying_item: True, reward: 4, done: False
agent pos: (3, 0), item_pos: (2, 0), carrying_item: True, reward: 13, done: True


## Q-learning Algorithm

In [203]:
class QLearner:
    def __init__(self, env, alpha=0.1, gamma=0.9, epsilon=0.01, episodes=10_000):
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.episodes = episodes
        self.q_table = {} # dict for the q_values. It'll contain 2 states & actions
    def get_q_value(self, state, action):
        print(state)
        print(f"action {action}")
        return self.q_table.get(state, {}).get(action, 0.0) # return empty dictionary if no state or action. this might be our issue
    
    def set_q_value(self, state, action, value):
        if state not in self.q_table:
            self.q_table[state] = {}
        self.q_table[state][action] = value
    
    def choose_action(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.choice([0, 1, 2, 3]) 
        else:
            q_values = {a: self.get_q_value(state, a) for a in [0, 1, 2, 3]} # get q_value for each action
            max_q = max(q_values.values())
            action = np.random.choice([a for a in q_values if q_values[a] == max_q]) # if two actions have same q_value, we pick one at random
            return action
    
    def learn(self):
        for episode in range(1):
            state = self.env.reset()
            done = False
            total_reward = 0
            step = 0
            if episode % 100 == 0:
                print(f"Running episode {episode}/{self.episodes}")
                print(f"Init state: agent pos: {self.env.agent_position} item pos: {self.env.item_position} target pos: {self.env.target_position} reward: {self.env.reward}")
            while not done:
                action = self.choose_action(state)
                next_state, reward, done = self.env.step(action)
                
                # update q values
                old_q_value = self.get_q_value(state, action)
                print(old_q_value)
                future_q_value = max([self.get_q_value(next_state, a) for a in [0, 1, 2, 3]]) # something is wrong with this one
                print(future_q_value)
                new_q_value = old_q_value + self.alpha * (reward + self.gamma * future_q_value - old_q_value) # formula from lecture
                
                self.set_q_value(state, action, new_q_value)
                
                state = next_state 
                total_reward += reward
                step += 1

            if episode % 100 == 0 and self.env.agent_position == self.env.target_position:
                print(f"finished episode {episode} with agent pos: {self.env.agent_position} target pos: {self.env.target_position} reward: {self.env.reward}")

In [204]:
# Usage example
env = GridWorld(grid_size=5, target_position=(3, 0))
q_learner = QLearner(env)
q_learner.learn()


Running episode 0/10000
Init state: agent pos: (1, 0) item pos: (2, 0) target pos: (3, 0) reward: 0
((1, 0), (2, 0), False, 0, False)
action 0
((1, 0), (2, 0), False, 0, False)
action 1
((1, 0), (2, 0), False, 0, False)
action 2
((1, 0), (2, 0), False, 0, False)
action 3
((1, 0), (2, 0), False, 0, False)
action 3
0.0
((1, 1), (2, 0), False, -1, False)
action 0
((1, 1), (2, 0), False, -1, False)
action 1
((1, 1), (2, 0), False, -1, False)
action 2
((1, 1), (2, 0), False, -1, False)
action 3
0.0
((1, 1), (2, 0), False, -1, False)
action 0
((1, 1), (2, 0), False, -1, False)
action 1
((1, 1), (2, 0), False, -1, False)
action 2
((1, 1), (2, 0), False, -1, False)
action 3
((1, 1), (2, 0), False, -1, False)
action 0
0.0
((0, 1), (2, 0), False, -2, False)
action 0
((0, 1), (2, 0), False, -2, False)
action 1
((0, 1), (2, 0), False, -2, False)
action 2
((0, 1), (2, 0), False, -2, False)
action 3
0.0
((0, 1), (2, 0), False, -2, False)
action 0
((0, 1), (2, 0), False, -2, False)
action 1
((0, 1), 

## Training Phase

## Evaluation Phase

## Visualisation

## Conclusion