# In [1]:
import numpy as np

# Grid-world layout and Q-learning hyperparameters
GRID_SIZE = 3            # cells per side; states are (i, j) with 0 <= i, j < GRID_SIZE
NUM_EPISODES = 1000      # number of training episodes
MAX_STEPS = 50           # step cap per training episode
LEARNING_RATE = 0.1      # step size applied to each Q-value update
DISCOUNT_FACTOR = 0.9    # weight on the best Q-value of the next state
EPSILON = 0.1            # probability of picking a random action

# Action names; an action's position in this list is its column in Q_values
ACTIONS = ['UP', 'DOWN', 'LEFT', 'RIGHT']
NUM_ACTIONS = len(ACTIONS)

# Every grid cell, first coordinate varying slowest; a cell's position in this
# list is its row in Q_values
STATES = [
    (cell // GRID_SIZE, cell % GRID_SIZE)
    for cell in range(GRID_SIZE * GRID_SIZE)
]
NUM_STATES = len(STATES)

# Cells that send the agent back to the start when entered
OBSTACLES = [(0, 2)]

# Reward received for entering a cell; cells not listed here yield 0
REWARDS = {
    (2, 2): 10,   # Goal
    (0, 2): -10,  # Obstacle
}

# Q-table: one row per state, one column per action, all zeros initially
Q_values = np.zeros(shape=(NUM_STATES, NUM_ACTIONS), dtype=float)

# Helper functions
def get_next_state(state, action, grid_size=None):
    """Return the cell reached by taking ``action`` from ``state``.

    Moves that would leave the grid are clamped, so the agent stays on the
    edge cell it was already on.

    Args:
        state: ``(i, j)`` pair of grid coordinates.
        action: one of ``'UP'``, ``'DOWN'``, ``'LEFT'``, ``'RIGHT'``.
            ``'UP'``/``'DOWN'`` decrease/increase the first coordinate,
            ``'LEFT'``/``'RIGHT'`` decrease/increase the second.
        grid_size: number of cells per side. Defaults to the module-level
            ``GRID_SIZE`` when omitted.

    Returns:
        The resulting ``(i, j)`` tuple.

    Raises:
        ValueError: if ``action`` is not one of the four known names
            (previously this case silently returned ``None``).
    """
    if grid_size is None:
        grid_size = GRID_SIZE
    first, second = state[0], state[1]
    last = grid_size - 1  # largest valid coordinate

    if action == 'UP':
        return (max(0, first - 1), second)
    if action == 'DOWN':
        return (min(last, first + 1), second)
    if action == 'LEFT':
        return (first, max(0, second - 1))
    if action == 'RIGHT':
        return (first, min(last, second + 1))
    raise ValueError("Unknown action: {!r}".format(action))

def select_action(state):
    """Choose an action name with an epsilon-greedy rule.

    Args:
        state: value used directly to index ``Q_values``; the training loop
            passes the integer position of the current cell in ``STATES``.

    Returns:
        With probability ``EPSILON`` a uniformly random entry of ``ACTIONS``;
        otherwise the action whose Q-value is largest for ``state``.
    """
    explore = np.random.rand() < EPSILON
    if explore:
        return np.random.choice(ACTIONS)
    best_column = np.argmax(Q_values[state])
    return ACTIONS[best_column]

# Training: tabular Q-learning, one episode per outer iteration
for episode in range(NUM_EPISODES):
    state = (0, 0)  # every episode starts at (0, 0)
    for step in range(MAX_STEPS):
        state_idx = STATES.index(state)
        action = select_action(state_idx)
        action_idx = ACTIONS.index(action)

        next_state = get_next_state(state, action)
        # The reward belongs to the cell that was entered, so look it up
        # before an obstacle hit rewrites next_state.
        reward = REWARDS.get(next_state, 0)
        if next_state in OBSTACLES:
            next_state = (0, 0)  # Reset to start if hitting an obstacle

        # Bootstrap from the best Q-value of the state the agent ends up in.
        max_next_Q = np.max(Q_values[STATES.index(next_state)])
        td_error = reward + DISCOUNT_FACTOR * max_next_Q - Q_values[state_idx, action_idx]
        Q_values[state_idx, action_idx] += LEARNING_RATE * td_error

        state = next_state
        if state == (2, 2):  # Reached goal
            break

# Testing
def test_agent():
    """Follow the greedy policy from (0, 0) until the goal (2, 2) is reached.

    At each step the action with the largest Q-value for the current state is
    taken. Entering an obstacle sends the agent back to (0, 0), as in
    training.

    Returns:
        List of state tuples visited, starting with (0, 0) and ending with
        (2, 2).

    Raises:
        RuntimeError: if the greedy policy comes back to a state it has
            already visited. The Q-table is not modified here and both the
            action choice and the transitions are deterministic, so a
            repeated state means the walk would loop forever (for example
            with an untrained, all-zero Q-table).
    """
    state = (0, 0)
    path = [state]
    visited = {state}
    while state != (2, 2):
        action = ACTIONS[np.argmax(Q_values[STATES.index(state)])]
        next_state = get_next_state(state, action)
        if next_state in OBSTACLES:
            next_state = (0, 0)  # Reset to start if hitting an obstacle
        path.append(next_state)
        if next_state in visited:
            # Deterministic rollout + repeated state == guaranteed cycle.
            raise RuntimeError(
                "Greedy policy revisits state {} without reaching the goal; "
                "path so far: {}".format(next_state, path)
            )
        visited.add(next_state)
        state = next_state
    return path

# Analyze results: roll out the greedy policy once and print the visited states.
# The printed path is whatever the learned Q-values produce; nothing here
# verifies that it is the shortest route.
path = test_agent()
print("Optimal path found by the agent:", path)

# Output:
# Optimal path found by the agent: [(0, 0), (0, 1), (1, 1), (1, 2), (2, 2)]
