In [1]:
import numpy as np

# Define maze environment
GRID_HEIGHT = 4
GRID_WIDTH = 5
START = (3, 0)
FLAG = (0, 4)
FIRE = (1, 4)
WALLS = [(1, 1), (1, 2), (1, 3)]

# Define Q-learning parameters
LEARNING_RATE = 0.1
DISCOUNT_FACTOR = 0.9
EXPLORATION_PROB = 1.0
EXPLORATION_DECAY = 0.001
EPISODES = 1000

# Initialize Q-table
q_table = np.zeros((GRID_HEIGHT * GRID_WIDTH, 4))

# Helper functions
def get_state_index(row, col):
    return row * GRID_WIDTH + col

def get_coords_from_index(index):
    return index // GRID_WIDTH, index % GRID_WIDTH

def is_valid_state(row, col):
    return 0 <= row < GRID_HEIGHT and 0 <= col < GRID_WIDTH and (row, col) not in WALLS

def get_reward(row, col):
    if (row, col) == FLAG:
        return 10
    elif (row, col) == FIRE:
        return -10
    else:
        return -1

# Q-learning algorithm
state = START
for episode in range(EPISODES):
    # Choose action
    if np.random.uniform(0, 1) < EXPLORATION_PROB:
        action = np.random.randint(0, 4)  # 0: up, 1: down, 2: left, 3: right
    else:
        state_index = get_state_index(*state)
        action = np.argmax(q_table[state_index])

    # Take action and observe reward
    row, col = state
    if action == 0 and is_valid_state(row - 1, col):
        new_state = (row - 1, col)
    elif action == 1 and is_valid_state(row + 1, col):
        new_state = (row + 1, col)
    elif action == 2 and is_valid_state(row, col - 1):
        new_state = (row, col - 1)
    elif action == 3 and is_valid_state(row, col + 1):
        new_state = (row, col + 1)
    else:
        new_state = state  # Stay in the same state if action is invalid
    reward = get_reward(*new_state)

    # Update Q-value
    state_index = get_state_index(*state)
    new_state_index = get_state_index(*new_state)
    q_table[state_index][action] = q_table[state_index][action] + LEARNING_RATE * (reward + DISCOUNT_FACTOR * np.max(q_table[new_state_index]) - q_table[state_index][action])

    # Update state and exploration probability
    state = new_state
    EXPLORATION_PROB *= (1 - EXPLORATION_DECAY)

    # Check if game is over
    if state == FLAG or state == FIRE:
        state = START  # Reset to start for the next episode

# Print the final Q-table
print("Final Q-table:")
print(q_table)

Final Q-table:
[[-0.95542428 -1.38586318 -0.86936215 -0.94788355]
 [-0.6751121  -0.65436142 -0.69335356 -0.61823391]
 [-0.4909078  -0.3531268  -0.34416245  0.5066819 ]
 [-0.019      -0.19       -0.109       4.68559   ]
 [ 0.          0.          0.          0.        ]
 [-1.34951102 -1.51129082 -1.40798317 -1.43160662]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [-1.64911407 -1.69984497 -1.6119771  -1.8379896 ]
 [-1.77040595 -1.82960025 -1.76027012 -1.80767124]
 [-1.73334001 -1.7647024  -1.69874994 -1.77914134]
 [-1.79749391 -1.69619813 -1.70534752 -1.62155421]
 [-6.5132156  -1.46328817 -1.53463495 -1.48470414]
 [-1.92384877 -1.95279758 -2.01504259 -2.03883711]
 [-1.9164728  -1.72089043 -1.82210779 -1.83766174]
 [-1.79626432 -1.78462878 -1.76779533 -1.72538331]
 [-1.65056572 -1.73074879 -1.70072668 -1.6776037 ]
 [-1.61438522 -1