In [6]:
import numpy as np
import random

# Grid-world configuration. Cells are addressed as (row, col) tuples:
# 'up'/'down' change index 0 and 'left'/'right' change index 1.
actions = ['up', 'down', 'left', 'right']  # moves the agent can choose from
obstacles = [(1, 1), (2, 2)]               # cells the agent is never allowed to enter
goal_state = (3, 3)                        # reaching this cell ends an episode
grid_size = 4                              # valid indices on each axis are 0 .. grid_size - 1

# Helper functions
def is_valid_state(state):
    """Tell whether `state` is a cell the agent may occupy.

    A state is valid when both state[0] and state[1] fall inside
    [0, grid_size) and the state is not listed in `obstacles`.

    Returns:
        True for a valid state, False otherwise.
    """
    # Reject anything off the board first, one axis at a time.
    if not 0 <= state[0] < grid_size:
        return False
    if not 0 <= state[1] < grid_size:
        return False
    # On the board: the only remaining disqualifier is being an obstacle.
    return state not in obstacles

def get_next_state(state, action):
    """Return the state reached by taking `action` from `state`.

    'up'/'down' decrement/increment state[0]; 'left'/'right'
    decrement/increment state[1]. An unrecognised action, or a move whose
    target fails `is_valid_state`, leaves the agent where it is.

    Returns:
        The new (state[0], state[1]) tuple if the move is allowed,
        otherwise the original `state`.
    """
    # Offset applied to (state[0], state[1]) for each recognised action.
    offsets = {
        'up': (-1, 0),
        'down': (1, 0),
        'left': (0, -1),
        'right': (0, 1),
    }

    if action not in offsets:
        return state

    step_0, step_1 = offsets[action]
    candidate = (state[0] + step_0, state[1] + step_1)

    if is_valid_state(candidate):
        return candidate
    # Blocked by the grid boundary or an obstacle: stay put.
    return state

def get_reward(state):
    """Return the reward for arriving in `state`: 10 at `goal_state`, -1 anywhere else."""
    if state == goal_state:
        return 10
    return -1

# Build the Q-table: every (row, col) cell of the grid (obstacle cells
# included) maps to its own dict of action -> Q-value, all starting at 0.
q_table = {
    (row, col): dict.fromkeys(actions, 0)
    for row in range(grid_size)
    for col in range(grid_size)
}

# Hyperparameters used by the training loop
epsilon = 0.1          # probability of picking a random action instead of the greedy one
discount_factor = 0.9  # weight given to the best Q-value of the next state
learning_rate = 0.1    # fraction of the TD error applied in each Q-value update
episodes = 500         # number of training episodes, each starting from (0, 0)

# Q-Learning algorithm
# Seed Python's RNG so that re-running this cell reproduces the same
# exploration choices and therefore the same Q-table.
random.seed(42)

for episode in range(episodes):
    state = (0, 0)  # Start state
    done = False

    # An episode lasts until the agent steps onto the goal cell.
    while not done:
        # Choose action: ε-greedy (explore with probability epsilon,
        # otherwise take the action with the highest current Q-value)
        if random.uniform(0, 1) < epsilon:
            action = random.choice(actions)
        else:
            action = max(q_table[state], key=q_table[state].get)

        # Take action; the reward depends on the cell actually reached
        next_state = get_next_state(state, action)
        reward = get_reward(next_state)

        # Update Q-value: move Q(state, action) towards the TD target
        # reward + discount_factor * max_a Q(next_state, a)
        best_next_value = max(q_table[next_state].values())
        td_target = reward + discount_factor * best_next_value
        q_table[state][action] += learning_rate * (td_target - q_table[state][action])

        # Move to next state
        state = next_state

        # Check if goal is reached
        if state == goal_state:
            done = True

print("Q-Table after training:")
# The loop targets deliberately avoid the names `state` and `actions`:
# a for-loop target stays bound after the loop, so reusing `actions` here
# would replace the module-level list of action names with a dict.
for cell, action_values in q_table.items():
    print(cell, action_values)

# Test the policy: follow the greedy (highest-Q) action from the start cell.
print("\nTesting Optimal Policy:")
state = (0, 0)
path = [state]
visited = {state}

while state != goal_state:
    action = max(q_table[state], key=q_table[state].get)
    state = get_next_state(state, action)
    path.append(state)

    # Both the greedy choice and the transition are deterministic, so landing
    # on an already-visited cell (including staying put after a blocked move)
    # means the walk would repeat forever. Stop instead of hanging the kernel.
    if state in visited:
        print("Greedy policy revisited", state, "before reaching the goal; stopping.")
        print("Path followed:", path)
        break
    visited.add(state)
else:
    # Runs only when the loop ended because the goal was reached (no break).
    print("Optimal Path:", path)


Q-Table after training:
(0, 0) {'up': -0.9118628592883264, 'down': 1.80979999999998, 'left': -0.1444590790238352, 'right': -2.1199025107974063}
(0, 1) {'up': -1.798713721018097, 'down': -1.737660653908409, 'left': -1.6870615838083391, 'right': -0.6399944385122414}
(0, 2) {'up': -0.9640418879973849, 'down': -0.965447653582764, 'left': -1.2828526442924901, 'right': 1.5567401981145639}
(0, 3) {'up': -0.41199317955240256, 'down': 4.341863975362333, 'left': -0.5254557836312632, 'right': -0.44857403083725556}
(1, 0) {'up': -0.361176909860538, 'down': 3.1219999999999857, 'left': 0.708677152991894, 'right': 0.3211472309597183}
(1, 1) {'up': 0, 'down': 0, 'left': 0, 'right': 0}
(1, 2) {'up': -0.6552228548400392, 'down': -0.48843811000000004, 'left': -0.490099501, 'right': 0.5899618096216224}
(1, 3) {'up': -0.107281128739345, 'down': 7.287380786986903, 'left': -0.32185359100000005, 'right': 0.2840951055399753}
(2, 0) {'up': 0.1735146325249885, 'down': 4.579999999999986, 'left': 1.632002625626897