In [46]:
import torch

## Set up the sample maze

Set up a 10 × 10 maze and represent it as a reward matrix. The reward matrix is a 10 × 10 matrix, and the reward of each cell is as follows:

- The reward of a wall cell (the border and the barriers inside the maze) is -100.
- The reward of a free cell is -1.
- The reward of the destination is 100.

Set up a stop matrix, which is a 10 × 10 bool matrix. It marks the cells where an episode ends: the wall cells and the destination.

The end point is (4,5), i.e. row 4, column 5 (zero-indexed).

In [47]:
# Per-cell rewards of the 10 x 10 maze:
#   -100 -> wall (border or interior barrier)
#     -1 -> free cell
#    100 -> destination
reward_matrix = torch.tensor(
    [
        [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100],
        [-100,   -1, -100,   -1, -100,   -1, -100,   -1, -100, -100],
        [-100,   -1,   -1,   -1,   -1,   -1, -100,   -1,   -1, -100],
        [-100,   -1, -100,   -1, -100,   -1, -100,   -1, -100, -100],
        [-100,   -1,   -1,   -1,   -1,  100,   -1,   -1,   -1, -100],
        [-100,   -1, -100, -100, -100,   -1, -100,   -1, -100, -100],
        [-100,   -1,   -1,   -1,   -1,   -1, -100,   -1,   -1, -100],
        [-100,   -1, -100,   -1, -100,   -1, -100,   -1, -100, -100],
        [-100,   -1,   -1,   -1, -100,   -1,   -1,   -1,   -1, -100],
        [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100],
    ],
    dtype=torch.int8,
)

# Bool mask, True where an episode stops. Every cell whose reward is at most
# -10 (i.e. the -100 wall cells) qualifies; the destination is added by hand
# below because its reward is positive.
stop_matrix = torch.le(reward_matrix, -10)
stop_matrix[4, 5] = True  # the destination is a stop cell too

## Set up the Q-Matrix

4 actions: left, right, up, down.

Each action corresponds to a Q-Matrix, and the initial value is 0.

In [None]:
# (row, column) offset of each action, in the order left, right, up, down.
actions = torch.tensor([[0, -1], [0, 1], [-1, 0], [1, 0]], dtype=torch.int8)
n_action = actions.size(0)

# One zero-initialised Q-table per action, indexed as q_matrix[action, row, col].
# This is documented with a comment rather than a trailing bare string: a string
# literal as the last expression of a cell is displayed as the cell's output.
q_matrix = torch.zeros((n_action, *reward_matrix.shape), dtype=torch.float32)

## Q-Learning related functions

### Decide the next action

Set up the greedy rate: the probability of exploiting (choosing the action with the highest Q-Value) rather than exploring (choosing a random action). In ε-greedy terms, the greedy rate is 1 − ε.

In [49]:
import random

# Probability of taking the greedy (highest-Q) action instead of a random one.
GREEDY = 0.7

def choose_action_id(state : torch.Tensor, greedy_rate = GREEDY) -> int:
    """
    Pick the index of the action to take from ``state``.

    A random number in [0, 1) is drawn; if it exceeds ``greedy_rate`` a
    uniformly random action index is returned. Otherwise the action with the
    highest Q-value at ``state`` is returned, with ties broken uniformly at
    random.

    Reads the module-level ``q_matrix`` and ``n_action``.
    """
    # Explore: ignore the Q-values entirely.
    if random.random() > greedy_rate:
        return random.randint(0, n_action - 1)

    # Exploit: collect every action that attains the best Q-value here.
    per_action_q = q_matrix[:, state[0], state[1]]
    is_best = per_action_q == per_action_q.max()
    best_ids = is_best.nonzero(as_tuple=True)[0].tolist()
    return random.choice(best_ids)
    
def get_next_state(state : torch.Tensor, action_id : int, actions_ = actions) -> torch.Tensor:
    """
    Return the cell reached from ``state`` by taking action ``action_id``.

    The result is ``state`` plus the offset stored at ``actions_[action_id]``.
    No bounds or wall check is done here.
    """
    return state + actions_[action_id]

### Update Q-Matrix

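$$
Q(s, a) \leftarrow Q(s, a) + \alpha \cdot \left(R(s, a) + \gamma \cdot \max_{a'} Q(s', a') - Q(s, a)\right)
$$

Where:
- $Q(s, a)$ is the Q-value of action $a$ in state $s$
- $\alpha$ is the learning rate
- $R(s, a)$ is the reward for taking action $a$ in state $s$ (here, the reward of the cell $s'$ that the agent moves into)
- $\gamma$ is the discount factor
- $\max_{a'} Q(s', a')$ is the maximum Q-value over all actions $a'$ in the next state $s'$

If $s'$ is a stop cell (a wall or the destination), there is no next action, so the target is just $R(s, a)$.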

In [50]:
DISCOUNT = 0.9  # discount factor (gamma)
LR = 0.1        # learning rate (alpha)

def update(state:torch.Tensor, action_id: int, next_state: torch.Tensor, discount = DISCOUNT, lr = LR):
    """
    Apply one Q-learning update for taking ``action_id`` in ``state``.

    The target is the reward of ``next_state``; when ``next_state`` is not a
    stop cell, ``discount`` times the best Q-value available from
    ``next_state`` is added. The stored Q-value is then moved towards the
    target by a fraction ``lr``.

    Reads the module-level ``reward_matrix`` and ``stop_matrix`` and modifies
    the module-level ``q_matrix`` in place. Returns nothing.
    """
    row, col = state[0], state[1]
    next_row, next_col = next_state[0], next_state[1]

    target = reward_matrix[next_row, next_col]
    if not stop_matrix[next_row, next_col]:
        # Bootstrap from the best action of the next cell only when the
        # episode continues.
        target = target + discount * q_matrix[:, next_row, next_col].max()

    current = q_matrix[action_id, row, col]
    q_matrix[action_id, row, col] += lr * (target - current)

## Running an episode: `go`

Starting from the initial state, continuously update the Q-Matrix until reaching the target state or exhausting all steps.

Wall cells are stop cells: when the agent moves into a wall during training, the Q-Matrix is updated with the wall's reward of -100 and the episode ends there.

In [51]:
# Upper bound on the number of moves in a single episode.
MAX_STEP = 50

def go(start_state: torch.Tensor, valid_mode: bool = False, actions_ = actions):
    """
    Run one episode from ``start_state`` and return the list of visited states.

    The episode ends when the current cell is a stop cell or once ``MAX_STEP``
    moves have been made. The returned list starts with ``start_state`` and
    includes the final cell.

    With ``valid_mode`` set, actions are chosen with a greedy rate of 1 and the
    Q-matrix is left untouched; otherwise the default greedy rate is used and
    ``update`` is called after every move.
    """
    visited = [start_state]
    current = start_state
    while True:
        if stop_matrix[current[0], current[1]]:
            break
        if len(visited) > MAX_STEP:
            break

        if valid_mode:
            chosen = choose_action_id(current, 1)
        else:
            chosen = choose_action_id(current)
        following = get_next_state(current, chosen, actions_)

        # Learning only happens outside validation runs.
        if not valid_mode:
            update(current, chosen, following)

        current = following
        visited.append(current)
    return visited

## Define agents

Define one or two agents, starting from different initial states.

In [None]:
import tqdm

# Number of training rounds; each round runs one episode per start point.
N_EPISODES = 500

# Start from an all-zero Q-table so that re-running this cell restarts learning.
q_matrix = torch.zeros_like(q_matrix)

state_1 = torch.tensor([8, 1])
state_2 = torch.tensor([8, 8]) # test for multiple start points

# NOTE: action selection uses the `random` module; unless it is seeded
# elsewhere, the learned Q-values differ from run to run.
for _episode in tqdm.trange(N_EPISODES):
    # `.clone()` hands each episode its own copy of the start state. Calling
    # `torch.tensor(...)` on an existing tensor also copies, but emits a
    # UserWarning recommending `clone().detach()` instead.
    go(state_1.clone())
    go(state_2.clone()) # test for multiple start points

## Testing

In [None]:
# Validation run from (8, 8): valid_mode=True makes `go` use a greedy rate
# of 1 and skip Q-matrix updates.
state_3 = torch.tensor([8, 8])
path = go(start_state=state_3, valid_mode=True)

# One visited state per line, start state first.
print(*path, sep="\n")