In [1]:
import numpy as np


In [2]:
# Gridworld configuration
GRID_HEIGHT, GRID_WIDTH = 5, 5  # Number of rows and columns in the grid

# Special states, given as (row, column), and the states they lead to
A_POSITION, A_PRIME_POSITION = (0, 1), (4, 1)  # A -> A' carries reward +10
B_POSITION, B_PRIME_POSITION = (0, 3), (2, 3)  # B -> B' carries reward +5

GAMMA = 0.9  # Discount factor
THETA = 1e-4  # Convergence threshold for Value Iteration

# (row, column) offset applied by each action
ACTION_DELTA = {
    'U': (-1, 0),  # Up: one row towards the top
    'D': (1, 0),   # Down: one row towards the bottom
    'L': (0, -1),  # Left: one column towards the left edge
    'R': (0, 1),   # Right: one column towards the right edge
}
# Available actions, in the insertion order of ACTION_DELTA: Up, Down, Left, Right
ACTIONS = list(ACTION_DELTA)


In [3]:
def is_valid_state(state):
    """
    Tell whether a (row, column) pair lies on the grid.

    Args:
        state (tuple): Candidate cell as (row, column).

    Returns:
        bool: True when the row index is in [0, GRID_HEIGHT) and the column
            index is in [0, GRID_WIDTH); False otherwise.
    """
    row, col = state
    row_on_grid = 0 <= row < GRID_HEIGHT
    # The column is only examined once the row has passed its bounds check.
    return row_on_grid and 0 <= col < GRID_WIDTH


In [4]:
def get_next_state_and_reward(state, action):
    """
    Deterministic one-step model of the gridworld.

    Args:
        state (tuple): The current cell as (row, column).
        action (str): One of 'U', 'D', 'L', 'R'.

    Returns:
        next_state (tuple): The cell occupied after the move.
        reward (float): The reward earned by the move: +10 out of A, +5 out
            of B, -1 for bumping into the grid edge, 0 otherwise.
    """
    # Special states ignore the action entirely; A is checked before B.
    special_transitions = (
        (A_POSITION, (A_PRIME_POSITION, 10.0)),
        (B_POSITION, (B_PRIME_POSITION, 5.0)),
    )
    for special_state, outcome in special_transitions:
        if state == special_state:
            return outcome

    d_row, d_col = ACTION_DELTA[action]
    candidate = (state[0] + d_row, state[1] + d_col)

    if is_valid_state(candidate):
        # Ordinary move inside the grid: no reward
        return candidate, 0.0

    # The move would leave the grid: stay put and take the penalty
    return state, -1.0


In [5]:
def value_iteration(gamma=None, theta=None, max_iterations=None):
    """
    Perform Value Iteration to compute the optimal state-value function.

    Each sweep visits the states in row-major order and replaces V(s) with
    max over actions of [reward + gamma * V(next_state)]. Updates are made
    in place, so values refreshed earlier in a sweep are used by states
    visited later in the same sweep.

    Args:
        gamma (float, optional): Discount factor. Defaults to the module-level
            GAMMA, read at call time.
        theta (float, optional): Convergence threshold; sweeping stops once the
            largest per-state change in a sweep is below it. Defaults to the
            module-level THETA, read at call time.
        max_iterations (int, optional): Upper bound on the number of sweeps.
            None (the default) means sweep until convergence, which may never
            happen when gamma >= 1. If the bound is reached first, the current
            (possibly unconverged) estimate is returned.

    Returns:
        V (numpy.ndarray): The state-value function as a 2D numpy array of
            shape (GRID_HEIGHT, GRID_WIDTH).
    """
    # Resolve defaults at call time so that edits to the configuration cell
    # take effect without re-running this definition.
    if gamma is None:
        gamma = GAMMA
    if theta is None:
        theta = THETA

    V = np.zeros((GRID_HEIGHT, GRID_WIDTH))
    sweeps = 0
    while max_iterations is None or sweeps < max_iterations:
        sweeps += 1
        delta = 0.0  # Largest change to any state value during this sweep
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                state = (i, j)
                old_value = V[state]
                # Bellman optimality backup: best one-step lookahead value
                best_value = max(
                    reward + gamma * V[next_state]
                    for next_state, reward in (
                        get_next_state_and_reward(state, action)
                        for action in ACTIONS
                    )
                )
                V[state] = best_value
                delta = max(delta, abs(old_value - best_value))
        if delta < theta:
            # Value function has converged
            break
    return V


In [6]:
def main():
    """
    Solve the gridworld and print the optimal state values as a text table.
    """
    optimal_values = value_iteration()

    # Title and column header
    print("                  v_*")
    column_labels = [f"Col {j}" for j in range(GRID_WIDTH)]
    print("       " + "  ".join(column_labels))

    # One output line per grid row: row label followed by fixed-width values
    for i in range(GRID_HEIGHT):
        cells = "".join(f"{optimal_values[i, j]:7.1f}" for j in range(GRID_WIDTH))
        print(f"Row {i} " + cells)


# Run the demo when this cell/script is executed directly.
if __name__ == "__main__":
    main()


                  v_*
       Col 0  Col 1  Col 2  Col 3  Col 4
Row 0    22.0   24.4   22.0   19.4   17.5
Row 1    19.8   22.0   19.8   17.8   16.0
Row 2    17.8   19.8   17.8   16.0   14.4
Row 3    16.0   17.8   16.0   14.4   13.0
Row 4    14.4   16.0   14.4   13.0   11.7
