# Sub-task 2: Reinforcement Learning


In [None]:
import numpy as np

# Utility of every cell in the 5x5 grid world, indexed as [row, column].
# np.nan marks cells the agent cannot occupy; expected_utility() treats a move
# into such a cell (or off the grid) as staying in the current cell.
grid_utilities = np.array(
    [
        [7.41, 7.52, 7.65, 10.0, 7.54],
        [7.31, np.nan, -10.0, 5.82, -10.0],
        [7.15, np.nan, 4.31, np.nan, 6.12],
        [6.98, 6.77, 6.44, 5.87, np.nan],
        [6.90, 6.80, 6.59, 6.51, 6.34],
    ],
    dtype=float,
)

# Step reward collected in every non-terminal state.
reward = -0.1

# Transition model: the intended move succeeds with probability 0.8; the
# remaining 0.2 is split evenly between the two perpendicular directions,
# i.e. 0.1 for each of them.
prob_success = 0.8
prob_fail = 0.1

# (row, column) offset applied to a state by each action.
_ACTION_DELTAS = {
    "UP": (-1, 0),
    "DOWN": (1, 0),
    "LEFT": (0, -1),
    "RIGHT": (0, 1),
}

# The two directions the agent may slip into when the intended move fails.
_PERPENDICULAR_ACTIONS = {
    "UP": ("LEFT", "RIGHT"),
    "DOWN": ("LEFT", "RIGHT"),
    "LEFT": ("UP", "DOWN"),
    "RIGHT": ("UP", "DOWN"),
}


def _utility_after_move(grid, state, delta):
    """Return the utility of the cell reached by applying ``delta`` to ``state``.

    A move that would leave the grid or enter a NaN cell is blocked, in which
    case the utility of ``state`` itself is returned.
    """
    row, col = state[0] + delta[0], state[1] + delta[1]
    n_rows, n_cols = grid.shape
    # The bounds test must come first: a negative index would otherwise wrap
    # around silently and a too-large one would raise IndexError.
    if not (0 <= row < n_rows and 0 <= col < n_cols) or np.isnan(grid[row, col]):
        return grid[state]
    return grid[row, col]


def expected_utility(grid, state, action, *, success_prob=None, fail_prob=None,
                     step_reward=None):
    """Return the expected utility of taking ``action`` in ``state``.

    The result is::

        success_prob * U(intended) + fail_prob * (U(side_1) + U(side_2)) + step_reward

    where ``side_1``/``side_2`` are the two directions perpendicular to
    ``action`` and any blocked move (off the grid or into a NaN cell) leaves
    the agent in ``state``.

    Args:
        grid: 2-D NumPy array of utilities, indexed ``[row, column]``; NaN
            marks cells that cannot be entered.
        state: ``(row, column)`` tuple of the current cell.
        action: One of ``"UP"``, ``"DOWN"``, ``"LEFT"``, ``"RIGHT"``.
        success_prob: Probability that the intended move happens. Defaults to
            the module-level ``prob_success``.
        fail_prob: Probability of slipping into *each* perpendicular
            direction. Defaults to the module-level ``prob_fail``.
        step_reward: Reward added for the step. Defaults to the module-level
            ``reward``.

    Returns:
        The expected utility as a float.

    Raises:
        ValueError: If ``action`` is not one of the four supported actions.
    """
    if action not in _ACTION_DELTAS:
        raise ValueError(
            f"Unknown action {action!r}; expected one of {sorted(_ACTION_DELTAS)}"
        )

    # Fall back to the notebook-level model parameters only when the caller
    # did not override them, so existing three-argument calls are unchanged.
    if success_prob is None:
        success_prob = prob_success
    if fail_prob is None:
        fail_prob = prob_fail
    if step_reward is None:
        step_reward = reward

    intended_utility = _utility_after_move(grid, state, _ACTION_DELTAS[action])

    first_side, second_side = _PERPENDICULAR_ACTIONS[action]
    perpendicular_utilities = (
        _utility_after_move(grid, state, _ACTION_DELTAS[first_side])
        + _utility_after_move(grid, state, _ACTION_DELTAS[second_side])
    )

    return (success_prob * intended_utility) + (fail_prob * perpendicular_utilities) + step_reward

# The highlighted states are (1, 0), (4, 2), and (3, 2), given as (row, column).
highlighted_states = [(1, 0), (4, 2), (3, 2)]
actions = ["UP", "DOWN", "LEFT", "RIGHT"]

# For each highlighted state, evaluate every action exactly once and keep the
# best one together with its expected utility.
optimal_policies = {}
optimal_utilities = {}
for state in highlighted_states:
    utilities = {action: expected_utility(grid_utilities, state, action) for action in actions}
    # Compare the unrounded values: rounding before taking the maximum would
    # make actions that differ by less than 0.01 tie, and max() would then
    # silently pick whichever appears first in `actions`.
    optimal_action = max(utilities, key=utilities.get)
    optimal_policies[state] = optimal_action
    optimal_utilities[state] = utilities[optimal_action]

# Output the optimal policies and their expected utilities for the highlighted
# states; rounding to two decimals is applied for display only.
for state, action in optimal_policies.items():
    expected_utility_value = round(optimal_utilities[state], 2)
    print(f"Optimal policy for state {state}: {action} with expected utility {expected_utility_value}")


Optimal policy for state (1, 0): UP with expected utility 7.29
Optimal policy for state (4, 2): LEFT with expected utility 6.64
Optimal policy for state (3, 2): DOWN with expected utility 6.44
