In [1]:
import numpy as np
import random


In [2]:

# Environment setup: a 1-D corridor of GRID_SIZE cells, indexed 0 .. GRID_SIZE - 1.
GRID_SIZE = 5
# Action names; an action's position in this list is its column index in the Q-table.
ACTIONS = ['left', 'right']
# Change in position caused by each action (-1 = one cell left, +1 = one cell right).
action_map = dict(left=-1, right=1)


In [3]:

# Q-table of action-value estimates, every entry starting at 0.0.
# Q[s, a] is the estimate for taking action index a in state s,
# so the shape is (number of states, number of actions).
Q = np.zeros(shape=(GRID_SIZE, len(ACTIONS)), dtype=float)


In [4]:

# Hyperparameters
alpha = 0.1      # Learning rate: fraction of each TD error applied to the Q-value
gamma = 0.9      # Discount factor applied to the next state's best Q-value
epsilon = 0.2    # Exploration rate: probability of taking a random action
episodes = 1000  # Number of training episodes

# Seed the stdlib RNG that drives the epsilon-greedy choices, so that
# Restart Kernel -> Run All reproduces the same learned Q-table.
SEED = 42
random.seed(SEED)


In [5]:

# Training loop: tabular Q-learning with an epsilon-greedy behaviour policy.
goal_state = GRID_SIZE - 1   # Right-most cell; reaching it ends the episode
n_actions = len(ACTIONS)     # Same count used for the Q-table's columns

for episode in range(episodes):
    state = 0  # Every episode starts in the left-most cell

    while state != goal_state:
        # Explore with probability epsilon, otherwise exploit current estimates.
        if random.random() < epsilon:
            # Sample over every defined action rather than a hard-coded [0, 1],
            # so extending ACTIONS cannot leave an action unexplored.
            action_idx = random.randrange(n_actions)
        else:
            # np.argmax returns the first maximum, so ties go to the lowest index.
            action_idx = int(np.argmax(Q[state]))

        action = ACTIONS[action_idx]

        # Apply the move, then clamp so the agent stays inside the corridor.
        next_state = state + action_map[action]
        next_state = max(0, min(goal_state, next_state))

        # +10 for stepping onto the goal, -1 for every other step.
        reward = 10 if next_state == goal_state else -1

        # Q-learning update: move Q[s, a] towards the one-step TD target.
        # The goal row is never updated in this loop, so it stays at its initial value.
        td_target = reward + gamma * np.max(Q[next_state])
        Q[state, action_idx] += alpha * (td_target - Q[state, action_idx])

        state = next_state  # Move to next state


In [6]:

# Print the trained action-value table: one row per grid cell,
# one column per entry of ACTIONS.
print("Final Q-table:\n", Q, sep=" ")


Final Q-table:
 [[ 3.12198293  4.58      ]
 [ 3.12196776  6.2       ]
 [ 4.5799459   8.        ]
 [ 6.19997558 10.        ]
 [ 0.          0.        ]]


In [7]:

# Test the agent: follow the greedy (argmax) policy from the start cell.
print("\n🚶 Agent path from start to goal:")
goal_state = GRID_SIZE - 1
state = 0
path = [state]
visited = {state}  # States already seen on this walk, for cycle detection
while state != goal_state:
    action_idx = int(np.argmax(Q[state]))
    action = ACTIONS[action_idx]
    # Apply the move and clamp it to the grid bounds.
    state = max(0, min(goal_state, state + action_map[action]))
    path.append(state)
    if state in visited:
        # The greedy policy and the environment are both deterministic, so a
        # repeated state means the walk would cycle forever (e.g. with an
        # untrained Q-table). Stop instead of hanging the kernel.
        print("Greedy policy revisited state", state, "- stopping before the goal.")
        break
    visited.add(state)

print("Path taken:", path)



🚶 Agent path from start to goal:
Path taken: [0, 1, 2, 3, 4]
