In [1]:
import torch

## Set up the sample maze

Set up a 10 × 10 maze and represent it as a reward matrix. The reward matrix is a 10 × 10 matrix, and the reward of each cell is as follows:

- The reward of a wall cell (the border and the barriers inside the maze) is -100.
- The reward of a walkable cell is -1.
- The reward of the destination is 100.

Set up a stop matrix, which is a 10 × 10 boolean matrix. An entry is `True` where the reward is -100 or lower (a wall); the agent cannot move into such a cell.

The end point is (4,5).

In [2]:
# Reward codes used to lay out the maze grid below.
_W, _P, _G = -100, -1, 100  # wall / walkable cell / goal

# Reward collected when the agent tries to move onto each cell (row, column).
reward_matrix = torch.tensor(
    [
        [_W, _W, _W, _W, _W, _W, _W, _W, _W, _W],
        [_W, _P, _W, _P, _W, _P, _W, _P, _W, _W],
        [_W, _P, _P, _P, _P, _P, _W, _P, _P, _W],
        [_W, _P, _W, _P, _W, _P, _W, _P, _W, _W],
        [_W, _P, _P, _P, _P, _G, _P, _P, _P, _W],
        [_W, _P, _W, _W, _W, _P, _W, _P, _W, _W],
        [_W, _P, _P, _P, _P, _P, _W, _P, _P, _W],
        [_W, _P, _W, _P, _W, _P, _W, _P, _W, _W],
        [_W, _P, _P, _P, _W, _P, _P, _P, _P, _W],
        [_W, _W, _W, _W, _W, _W, _W, _W, _W, _W],
    ],
    dtype=torch.int8,
)

# Boolean mask with the same shape as the maze: True where the reward is
# -100 or lower, i.e. on wall cells. The goal cell (reward 100) is not marked.
stop_matrix = torch.le(reward_matrix, -100)

# (row, column) of the goal cell.
target_index = torch.tensor((4, 5))

## Set up the Q-Matrix

4 actions: left, right, up, down.

Each action corresponds to a Q-Matrix, and the initial value is 0.

In [None]:
# (row, column) offsets of the four moves, in order: left, right, up, down.
actions = torch.tensor([[0, -1], [0, 1], [-1, 0], [1, 0]], dtype=torch.int8)
n_action = actions.size(0)

# One float32 Q-table per action, zero-initialised and stacked along dim 0,
# so a value is addressed as q_matrix[action_id, row, col].
# (Documented with a comment rather than a trailing bare string, which the
# notebook would echo as the cell's output.)
q_matrix = torch.zeros_like(reward_matrix, dtype=torch.float32).repeat(n_action, 1, 1)

## Q-Learning related functions

### Decide the next action

Set up the greedy rate (the probability of exploiting, i.e. $1 - \epsilon$), which is used to decide whether to exploit (choose the action with the highest Q-value) or to explore (choose a random action).

In [4]:
import random

# Default probability of taking a best-known (greedy) action instead of a random one.
GREEDY = 0.7

def choose_action_id(state : torch.Tensor, greedy_rate = GREEDY) -> int:
    """
    Pick the index of the action to take from ``state``.

    A uniform random draw decides the mode: if it exceeds ``greedy_rate`` a
    random action index is returned; otherwise one of the actions whose
    Q-value at ``state`` is maximal is returned, with ties broken at random.

    Args:
        state: tensor whose first two entries are the row and column of the cell.
        greedy_rate: threshold for the draw; ``1`` always selects a best action.

    Returns:
        An action index in ``[0, n_action - 1]``.
    """
    exploring = random.random() > greedy_rate
    if exploring:
        return random.randint(0, n_action - 1)

    # Q-value of every action at this cell, read from the global Q-matrix.
    candidates = q_matrix[:, state[0], state[1]]
    best_value = candidates.max()
    # Collect every action that attains the maximum so ties are not biased
    # towards the lowest index.
    tied_ids = torch.nonzero(candidates == best_value, as_tuple=True)[0].tolist()
    return random.choice(tied_ids)
    
def get_next_state(state : torch.Tensor, action_id : int) -> torch.Tensor:
    """
    Return the cell reached by applying action ``action_id`` to ``state``.

    The result is ``state`` plus the matching row of the global ``actions``
    table. No wall or bounds check is performed here.
    """
    return state + actions[action_id]

### Update Q-Matrix

$$
Q(s, a) \leftarrow Q(s, a) + \alpha \cdot \left(R(s, a) + \gamma \cdot \max_{a'} Q(s', a') - Q(s, a)\right)
$$

Where:
- $Q(s, a)$ is the Q-value of action $a$ in state $s$
- $\alpha$ is the learning rate
- $R(s, a)$ is the reward of action $a$ in state $s$
- $\gamma$ is the discount factor
- $\max_{a'} Q(s', a')$ is the maximum Q-value over all actions $a'$ in the next state $s'$

In [5]:
# Default discount factor (gamma) and learning rate (alpha) for `update`.
DISCOUNT = 0.9
LR = 0.1

def update(state:torch.Tensor, action_id: int, next_state: torch.Tensor, discount = DISCOUNT, lr = LR):
    """
    Apply one Q-learning update to the global Q-matrix, in place.

    The entry for (``action_id``, ``state``) is moved towards a target by a
    fraction ``lr`` of the difference. The target is the reward of
    ``next_state``; unless ``next_state`` is flagged in ``stop_matrix``, the
    discounted best Q-value at ``next_state`` is added to it.

    Args:
        state: cell the action was taken from (row, column).
        action_id: index of the action taken.
        next_state: cell the action leads to (row, column).
        discount: weight of the best Q-value at ``next_state``.
        lr: step size of the update.
    """
    row, col = state[0], state[1]
    next_row, next_col = next_state[0], next_state[1]

    target = reward_matrix[next_row, next_col]
    if not stop_matrix[next_row, next_col]:
        # Only cells not flagged in stop_matrix contribute a bootstrapped future value.
        target = target + discount * torch.max(q_matrix[:, next_row, next_col])

    current = q_matrix[action_id, row, col]
    q_matrix[action_id, row, col] += lr * (target - current)

## Class `Agent`

Starting from the initial state, continuously update the Q-Matrix until reaching the target state or exhausting all steps.

When hitting a wall, do not update the state. After hitting the wall, try selecting another action from the current state.

In [6]:
# Default cap on the number of steps an agent takes in a single run.
MAX_STEP = 100

In [7]:
class Agent:
    """
    A maze walker that follows, and when training also updates, the global Q-matrix.

    Attributes:
        state: the start state passed to the constructor.
        restriction_matrix: maze-shaped mask of extra cells this agent may not
            enter, on top of the cells flagged in ``stop_matrix``.
        max_step: maximum number of steps taken by one call to ``go``.
        states: list initialised with the start state.
    """
    def __init__(self, start_state: torch.Tensor, restriction_matrix: torch.Tensor, max_step = MAX_STEP):
        self.state = start_state
        self.restriction_matrix = restriction_matrix
        self.max_step = max_step
        self.states = [start_state]

    def go(self, start_state: torch.Tensor, valid: bool = False, debug: bool = False) -> list:
        """
        Walk from ``start_state`` until the target cell is reached or the
        step budget ``self.max_step`` is used up.

        Args:
            start_state: cell (row, column) the walk starts from.
            valid: if True, always take a best-known action and leave the
                Q-matrix untouched; if False, choose actions with the default
                greedy rate and update the Q-matrix after every attempted move.
            debug: if True, print a message whenever a move is blocked.

        Returns:
            The start state followed by the cell targeted by every attempted
            move. Blocked moves are included in the list even though the
            agent stays where it was.
        """
        states = [start_state]
        state = start_state
        # Use this agent's own step budget. The loop used to read the global
        # MAX_STEP, which silently ignored the constructor's max_step argument.
        while not torch.equal(state, target_index) and len(states) <= self.max_step:
            if valid: # evaluation: always exploit
                action_id = choose_action_id(state, 1)
            else:   # training: explore/exploit with the default greedy rate
                action_id = choose_action_id(state)
            next_state = get_next_state(state, action_id)
            if not valid:
                # Learn from the attempted move, including moves that turn out to be blocked.
                update(state, action_id, next_state)

            row, col = next_state[0], next_state[1]
            blocked = bool(stop_matrix[row, col]) or bool(self.restriction_matrix[row, col])
            if not blocked:
                state = next_state
            elif debug:
                print(f'{next_state}, hit the wall or restricted')
            states.append(next_state) # record the attempted cell, blocked or not
        return states


## Define agents

Define 4 agents, each confined to its own region of the maze.

In [None]:
# Reset every Q-value to zero before the agents are (re)created.
q_matrix = torch.zeros_like(q_matrix)


def _row_col_mask(row: int, col: int) -> torch.Tensor:
    """Return a maze-shaped bool mask that is True along one full row and one full column."""
    mask = torch.zeros_like(reward_matrix, dtype=torch.bool)
    mask[row, :] = True
    mask[:, col] = True
    return mask


# Each agent is fenced in by one forbidden row and one forbidden column.
restriction_matrix_1 = _row_col_mask(row=6, col=6)
restriction_matrix_2 = _row_col_mask(row=6, col=3)
restriction_matrix_3 = _row_col_mask(row=3, col=6)
restriction_matrix_4 = _row_col_mask(row=3, col=3)


# Start cells (row, column) of the four agents.
state_1 = torch.tensor((1, 1))
state_2 = torch.tensor((2, 8))
state_3 = torch.tensor((8, 1))
state_4 = torch.tensor((8, 8))

r1 = Agent(start_state=state_1, restriction_matrix=restriction_matrix_1)
r2 = Agent(start_state=state_2, restriction_matrix=restriction_matrix_2)
r3 = Agent(start_state=state_3, restriction_matrix=restriction_matrix_3)
r4 = Agent(start_state=state_4, restriction_matrix=restriction_matrix_4)

## Start training

In [None]:
import tqdm
# 100 training rounds; in each round every agent performs one walk from its
# own start cell, in the fixed order r1, r2, r3, r4.
for _ in tqdm.trange(100):
    for agent, start in ((r1, state_1), (r2, state_2), (r3, state_3), (r4, state_4)):
        agent.go(start)

## Testing

### Test one agent

In [None]:
# An all-False mask: the test agent is not confined to any region.
restriction_matrix_test = torch.zeros_like(reward_matrix, dtype=torch.bool)
state_test = torch.tensor((6, 8))  # start cell (row, column)

# Walk in evaluation mode: best-known actions only, Q-matrix left untouched.
r_test = Agent(state_test, restriction_matrix_test)
path = r_test.go(state_test, valid=True)

# Replay the walk, tracking the last cell that was actually entered so that
# blocked attempts can be reported together with where the agent stayed.
last_pos = path[0]
for step in path:
    if stop_matrix[step[0], step[1]]:
        print(f'{step}, hit the wall this time, back to pos: {last_pos}')
        continue
    last_pos = step
    print(step)

### Test crossing areas

In [12]:
def get_path_staging(start_state : torch.Tensor, target_state: torch.Tensor) -> list:
    """
    Build a route from ``start_state`` to ``target_state`` that passes through
    the maze's goal cell.

    Two unrestricted agents walk in evaluation mode (best-known actions, no
    Q-matrix updates), one from each endpoint. The walk from ``start_state``,
    without its final entry, is joined to the walk from ``target_state`` in
    reverse order.

    Returns:
        A list of state tensors: the concatenation of the two walks as
        described above.
    """
    unrestricted = torch.zeros_like(reward_matrix, dtype=torch.bool)

    def walk_from(origin: torch.Tensor) -> list:
        return Agent(origin, unrestricted).go(origin, True)

    outbound = walk_from(start_state)
    inbound = walk_from(target_state)

    # Drop the last outbound entry so the junction cell is not listed twice.
    return outbound[:-1] + inbound[::-1]

In [None]:
# Route from cell (6, 8) to cell (4, 1), joined from each cell's own walk.
get_path_staging(torch.tensor([6,8]), torch.tensor([4,1]))

In [None]:
# Route from cell (3, 1) to cell (2, 8), joined from each cell's own walk.
get_path_staging(torch.tensor([3,1]), torch.tensor([2,8]))