# In [6]:
import numpy as np
import random

# Environment: a 4x4 grid world. Each cell holds the reward collected when the
# agent lands on it: 0 for a neutral cell, -1 for a penalty cell, 1 for the goal.
# Layout (G = goal, X = penalty, . = neutral):
#     . . . .
#     . X . X
#     . . . X
#     . X . G
grid = np.zeros((4, 4), dtype=int)
grid[[1, 1, 2, 3], [1, 3, 3, 1]] = -1  # penalty cells, given as (rows, cols)
grid[3, 3] = 1  # goal cell in the bottom-right corner

# Q-learning hyperparameters.
num_episodes, max_steps = 1000, 100  # episodes to run / step cap per episode
alpha, gamma, epsilon = 0.1, 0.9, 0.1  # learning rate / discount / exploration

# Q-table: one row per grid cell (flat index) and one column per action,
# with every estimate starting at zero.
num_actions = 4
num_states = grid.size
q_table = np.zeros((num_states, num_actions))

def get_state_indices(state):
    """Translate a flat state index into its ``(row, col)`` position on ``grid``.

    The conversion is delegated to ``np.unravel_index`` using the shape of the
    module-level ``grid``, so it is the inverse of the ``np.ravel_multi_index``
    call used elsewhere in this file to build state indices.
    """
    coords = np.unravel_index(state, grid.shape)
    return coords

def get_next_state_and_reward(state, action):
    """Apply ``action`` in ``state`` and report where the agent ends up.

    Actions are encoded as 0 = up, 1 = down, 2 = left, 3 = right; any other
    value leaves the position unchanged. A move that would leave the grid is
    clamped, so the agent stays on the edge cell it was already on.

    Returns:
        A ``(next_state, reward)`` pair: the flat index of the resulting cell
        and the ``grid`` value stored in that cell.
    """
    # (row delta, column delta) for each action code.
    moves = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}
    d_row, d_col = moves.get(action, (0, 0))

    n_rows, n_cols = grid.shape
    row, col = get_state_indices(state)
    # Clamp to the grid so that walking into a wall is a no-op.
    row = min(max(row + d_row, 0), n_rows - 1)
    col = min(max(col + d_col, 0), n_cols - 1)

    landed = (row, col)
    return np.ravel_multi_index(landed, grid.shape), grid[landed]

# Run the Q-learning algorithm.
#
# Every episode starts in a uniformly random cell and runs until the goal
# reward (1) is collected or max_steps transitions have been made. All random
# draws come from the stdlib `random` module, so a single random.seed(...) call
# is enough to make a run reproducible.
for episode in range(num_episodes):
    state = random.randrange(num_states)
    for step in range(max_steps):
        # Epsilon-greedy selection: explore with probability epsilon, otherwise
        # take the action with the highest current estimate for this state.
        if random.random() < epsilon:
            action = random.randrange(num_actions)
        else:
            action = np.argmax(q_table[state])
        next_state, reward = get_next_state_and_reward(state, action)

        # Reaching the goal ends the episode, so no future return follows it:
        # the target for that transition is the reward alone. Bootstrapping
        # from the goal's own Q-values would feed them back into every
        # goal-reaching update (they become non-zero whenever an episode
        # happens to start in the goal cell) and inflate all estimates.
        reached_goal = reward == 1
        if reached_goal:
            target = reward
        else:
            target = reward + gamma * np.max(q_table[next_state])
        q_table[state, action] += alpha * (target - q_table[state, action])

        state = next_state
        if reached_goal:
            break

# Report the greedy policy: for each state, the index of the action with the
# largest Q-value (0 = up, 1 = down, 2 = left, 3 = right), laid out like the grid.
policy = q_table.argmax(axis=1)
print("Policy:")
print(policy.reshape(grid.shape))


# Sample output from one run (no random seed is set, so results vary between runs):
# Policy:
# [[3 3 3 1]
#  [0 0 0 1]
#  [0 2 3 1]
#  [0 0 3 1]]
