In [8]:
import numpy as np

# Environment layout: sizes of the state/action spaces and the terminal state
goal_state = 3  # Index of the terminal (goal) state
n_actions = 4   # Size of the action set: Left, Right, Up, Down
n_states = 6    # Total number of states

In [9]:
# Reward table: one row per state (0-5), one column per action (0-3).
# Every entry is the default step cost of -1; the single exception is the
# goal-state row (state 3), column 3, which carries the +10 goal reward.
reward_matrix = np.full((6, 4), -1)
reward_matrix[3, 3] = 10

In [10]:
# Q-table: one row per state, one column per action, every estimate starts at 0.0
q_table_shape = (n_states, n_actions)
Q = np.zeros(q_table_shape, dtype=float)

In [11]:
# Hyperparameters for tabular Q-learning
episodes = 100         # How many training episodes to run
epsilon = 0.1          # Probability of taking a random (exploratory) action
discount_factor = 0.9  # Gamma: weight given to future rewards
learning_rate = 0.1    # Alpha: step size of each Q-value update

In [12]:
# Human-readable label for each action index (used when printing the policy)
actions = [
    "Left",   # action 0
    "Right",  # action 1
    "Up",     # action 2
    "Down",   # action 3
]

In [None]:
# Q-Learning process
#
# Simplified 1-D environment: only the action with index 3 advances the agent
# from `state` to `state + 1`; every other action leaves it in place.
# NOTE: index 3 is labelled "Down" in `actions`, so the printed policy will
# show "Down" for the advancing move.
forward_action = 3

# Local, seeded generator so a fresh "Restart & Run All" reproduces the same Q-table.
rng = np.random.default_rng(0)

# Hard cap on steps per episode. Because the agent can only move forward,
# states past the goal can never reach it; without this cap an episode that
# starts there would loop forever.
max_steps_per_episode = 100

for episode in range(episodes):
    state = int(rng.integers(0, n_states))  # Start at a random state

    for _ in range(max_steps_per_episode):
        if state == goal_state:  # Terminal state reached: episode is over
            break

        # Epsilon-greedy policy for action selection
        if rng.random() < epsilon:  # Exploration
            action = int(rng.integers(0, n_actions))
        else:  # Exploitation: best known action for the current state
            action = int(np.argmax(Q[state]))

        # Simplified state transition: advance by one state if possible
        if action == forward_action and state + 1 < n_states:
            next_state = state + 1
        else:
            next_state = state

        # Reward is looked up by the state this step ends in. The goal row of
        # reward_matrix holds the +10 "for reaching" the goal; indexing by the
        # state being left would never read that row, because the episode
        # stops as soon as the goal is entered.
        reward = reward_matrix[next_state, action]

        # Q-value update (Bellman equation). The goal row of Q is never
        # updated, so its max stays 0 and acts as the terminal value.
        td_target = reward + discount_factor * np.max(Q[next_state])
        Q[state, action] += learning_rate * (td_target - Q[state, action])

        # Move to the next state
        state = next_state

In [None]:
# Show the Q-table after training: a heading line followed by the array itself
print("Learned Q-table:", Q, sep="\n")

In [None]:
# Greedy policy: for every state, take the label of the highest-valued action
# (ties resolve to the lowest action index, as np.argmax returns the first max)
greedy_action_indices = np.argmax(Q, axis=1)
optimal_policy = [actions[idx] for idx in greedy_action_indices]

print("\nOptimal Policy:")
for s, action_label in enumerate(optimal_policy):
    print(f"State {s}: {action_label}")