In [5]:
import numpy as np

In [6]:
grid_size = 100
grid = np.zeros((grid_size, grid_size))
start, goal = (0, 0), (grid_size - 1, grid_size - 1)
obstacles = set((np.random.randint(grid_size), np.random.randint(grid_size)) for _ in range(200))
for obs in obstacles:
    grid[obs] = -1

In [7]:
def is_valid_move(state):
    x, y = state
    return 0 <= x < grid_size and 0 <= y < grid_size and grid[x, y] != -1

def get_neighbors(x, y):
    moves = [(x-1, y), (x+1, y), (x, y-1), (x, y+1)]
    return [m for m in moves if is_valid_move(m)]

gamma, threshold = 0.9, 0.01
values = np.zeros((grid_size, grid_size))

In [8]:
# Value Iteration
for _ in range(500):
    new_values = np.copy(values)
    for x in range(grid_size):
        for y in range(grid_size):
            if (x, y) == goal or (x, y) in obstacles:
                continue
            neighbors = get_neighbors(x, y)
            if neighbors:
                new_values[x, y] = max(-1 + gamma * values[nx, ny] for nx, ny in neighbors)
    if np.max(np.abs(new_values - values)) < threshold:
        break
    values = new_values

In [9]:
# Print the optimal policy from Value Iteration
print("Optimal policy from Value Iteration (0=up, 1=down, 2=left, 3=right):")
policy_vi = np.full((grid_size, grid_size), -1)
actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]
for x in range(grid_size):
    for y in range(grid_size):
        if (x, y) == goal or (x, y) in obstacles:
            policy_vi[x, y] = -1
        else:
            neighbors = get_neighbors(x, y)
            if neighbors:
                policy_vi[x, y] = np.argmax([-1 + gamma * values[nx, ny] for nx, ny in neighbors])
print(policy_vi)

Optimal policy from Value Iteration (0=up, 1=down, 2=left, 3=right):
[[ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 ...
 [ 0  0  0 ...  1  1  1]
 [ 0  0  0 ...  1  1  1]
 [ 0  0  0 ...  2  2 -1]]


In [10]:
# Q-Learning
q_table = np.zeros((grid_size, grid_size, 4))
epsilon, alpha = 0.1, 0.1

for _ in range(5):
    state = start
    while state != goal:
        x, y = state
        if np.random.rand() < epsilon:
            action = np.random.choice(4)
        else:
            action = np.argmax(q_table[x, y])
        dx, dy = actions[action]
        next_state = (x + dx, y + dy)
        if is_valid_move(next_state):
            nx, ny = next_state
            reward = 10 if next_state == goal else -1
            best_next_action = np.max(q_table[nx, ny])
            q_table[x, y, action] += alpha * (reward + gamma * best_next_action - q_table[x, y, action])
            state = next_state
        else:
            q_table[x, y, action] += alpha * (-1 - q_table[x, y, action])

# Print the policy derived from Q-learning
print("Optimal policy from Q-learning (0=up, 1=down, 2=left, 3=right):")
policy_q = np.argmax(q_table, axis=2)
print(policy_q)


Optimal policy from Q-learning (0=up, 1=down, 2=left, 3=right):
[[0 0 2 ... 0 3 0]
 [2 1 2 ... 3 3 3]
 [2 0 0 ... 1 3 3]
 ...
 [1 3 3 ... 2 3 1]
 [3 0 3 ... 1 3 1]
 [3 2 3 ... 0 3 0]]
