In [8]:
import numpy as np
# Define the grid world environment
grid_size = 5
states = grid_size * grid_size  # Total number of states
actions = ["up", "down", "left", "right"]  # Possible actions

rewards = np.full((grid_size, grid_size), -1)  # Define rewards for each cell
rewards[0, 4] = 10  # Goal state
rewards[1, 1] = -100  # Obstacle or blocked state

In [9]:
# Define the transition probabilities for each action in each state
transitions = []
for _ in range(states):
    action_probs = []
    for _ in range(len(actions)):
        action_probs.append({"prob": 0.25})  # All actions have equal probability
    transitions.append(action_probs)

In [None]:
# Define the transition probabilities for each action in each state
transitions = []
for _ in range(states):
    action_probs = []
    for _ in range(len(actions)):
        action_probs.append({"prob": 0.25})  # All actions have equal probability
    transitions.append(action_probs)

In [10]:
# Define the initial value function
V = np.zeros((grid_size, grid_size))

In [11]:
# Value iteration
gamma = 0.9  # Discount factor
epsilon = 0.01  # Convergence threshold
delta = epsilon + 1  # Initialize delta to enter the loop
while delta > epsilon:
    delta = 0
    for i in range(grid_size):
        for j in range(grid_size):
            if (i == 0 and j == 4) or (i == 1 and j == 1):
                continue  # Skip goal state and blocked state
            v = V[i, j]
            state_values = []
            for action in actions:
                new_i, new_j = i, j
                if action == "up":
                    new_i -= 1
                elif action == "down":
                    new_i += 1
                elif action == "left":
                    new_j -= 1
                elif action == "right":
                    new_j += 1

                # Check if the new state is valid
                if new_i < 0 or new_i >= grid_size or new_j < 0 or new_j >= grid_size or (i == 1 and j == 1):
                    new_i, new_j = i, j  # Stay in the current state

                # Calculate the expected value
                value = 0
                for prob in transitions[i * grid_size + j][actions.index(action)]["prob"]:
                    reward = rewards[new_i, new_j]
                    value += prob * (reward + gamma * V[new_i, new_j])
                state_values.append(value)

            # Update the value function
            V[i, j] = max(state_values)

            # Update the delta
            delta = max(delta, abs(v - V[i, j]))

TypeError: 'float' object is not iterable

In [12]:
# Print the optimal policy
policy = np.empty((grid_size, grid_size), dtype=object)
for i in range(grid_size):
    for j in range(grid_size):
        if (i == 0 and j == 4) or (i == 1 and j == 1):
            policy[i, j] = "Goal/Blocked"
        else:
            state_values = []
            for action in actions:
                new_i, new_j = i, j
                if action == "up":
                    new_i -= 1
                elif action == "down":
                    new_i += 1
                elif action == "left":
                    new_j -= 1
                elif action == "right":
                    new_j += 1

                if new_i < 0 or new_i >= grid_size or new_j < 0 or new_j >= grid_size or (i == 1 and j == 1):
                    new_i, new_j = i, j  # Stay in the current state

                state_values.append(V[new_i, new_j])

            max_value = max(state_values)
            max_indices = [index for index, value in enumerate(state_values) if value == max_value]
            optimal_actions = [actions[index] for index in max_indices]
            policy[i, j] = optimal_actions

print("Optimal Policy:")
for row in policy:
    print(row)

Optimal Policy:
[list(['up', 'down', 'left', 'right'])
 list(['up', 'down', 'left', 'right'])
 list(['up', 'down', 'left', 'right'])
 list(['up', 'down', 'left', 'right']) 'Goal/Blocked']
[list(['up', 'down', 'left', 'right']) 'Goal/Blocked'
 list(['up', 'down', 'left', 'right'])
 list(['up', 'down', 'left', 'right'])
 list(['up', 'down', 'left', 'right'])]
[list(['up', 'down', 'left', 'right'])
 list(['up', 'down', 'left', 'right'])
 list(['up', 'down', 'left', 'right'])
 list(['up', 'down', 'left', 'right'])
 list(['up', 'down', 'left', 'right'])]
[list(['up', 'down', 'left', 'right'])
 list(['up', 'down', 'left', 'right'])
 list(['up', 'down', 'left', 'right'])
 list(['up', 'down', 'left', 'right'])
 list(['up', 'down', 'left', 'right'])]
[list(['up', 'down', 'left', 'right'])
 list(['up', 'down', 'left', 'right'])
 list(['up', 'down', 'left', 'right'])
 list(['up', 'down', 'left', 'right'])
 list(['up', 'down', 'left', 'right'])]
