Create updated gridworld environment

# Task 1: Updating the Reward Function
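- In an MDP, the reward function is formally defined as a mapping $R : S \to \mathbb{R}$ that assigns a scalar reward to each state. For this gridworld, $R(s) = +10$ if $s$ is the terminal state, $R(s) = -5$ if $s$ is a grey state, and $R(s) = -1$ for every other (regular) state.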

- Instead of pre-filling a reward matrix, we update the code to implement this rule directly in the environment, so that it reflects the mathematical definition.

- The reward is computed dynamically from the state's category instead of being stored in a fixed reward grid. This keeps a clean separation between the environment structure and the reward logic, and makes the reward values easy to modify if they change.

In [None]:
import numpy as np

class GridWorld:
    """Deterministic square gridworld with a state-category reward function.

    States are ``(row, column)`` pairs with ``0 <= row, column < env_size``.
    Each state belongs to one of three categories -- terminal, grey, or
    regular -- and the reward is looked up from that category on demand
    rather than stored in a reward matrix.

    Args:
        env_size: Side length of the square grid.
        terminal_state: ``(row, column)`` of the goal state.
        grey_states: Iterable of ``(row, column)`` penalty states.
        terminal_reward: Reward for the terminal state.
        grey_reward: Reward for a grey state.
        regular_reward: Reward for every other state.

    The defaults reproduce the original assignment layout, which was designed
    for a 5x5 grid.  No check is made that the special states lie inside the
    grid: with ``env_size < 5`` the default terminal state ``(4, 4)`` is never
    reachable, so pass ``terminal_state`` explicitly for other sizes.
    """

    def __init__(
        self,
        env_size,
        *,
        terminal_state=(4, 4),
        grey_states=((2, 2), (3, 0), (0, 4)),
        terminal_reward=10,
        grey_reward=-5,
        regular_reward=-1,
    ):
        self.env_size = env_size

        # Special states, normalised to tuples so that the equality and
        # membership tests against (i, j) below always compare like with like.
        self.terminal_state = tuple(terminal_state)
        self.grey_states = [tuple(state) for state in grey_states]

        # Reward values based on state category
        self.terminal_reward = terminal_reward
        self.grey_reward = grey_reward
        self.regular_reward = regular_reward

        # Possible actions as (row delta, column delta): Right, Left, Down, Up.
        # action_description[k] is the human-readable name of actions[k].
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]
        self.action_description = ["Right", "Left", "Down", "Up"]

    def get_reward(self, i, j):
        """Return the reward R(s) for state ``(i, j)`` based on its category.

        The terminal check comes first, so a state listed as both terminal
        and grey yields the terminal reward.
        """
        if (i, j) == self.terminal_state:
            return self.terminal_reward
        elif (i, j) in self.grey_states:
            return self.grey_reward
        else:
            return self.regular_reward

    def step(self, action_index, i, j):
        """Apply one action from state ``(i, j)`` (deterministic transition).

        Args:
            action_index: Index into ``self.actions``.
            i: Current row.
            j: Current column.

        Returns:
            ``(next_i, next_j, reward, done)`` where ``reward`` is the reward
            of the state the agent ends up in and ``done`` is True when that
            state is the terminal state.  If the action would leave the grid,
            the agent stays in ``(i, j)`` and receives that state's reward.
        """
        action = self.actions[action_index]
        next_i, next_j = i + action[0], j + action[1]

        # Boundary check: an off-grid move leaves the agent where it was.
        if not self.is_valid_state(next_i, next_j):
            next_i, next_j = i, j

        reward = self.get_reward(next_i, next_j)
        done = self.is_terminal_state(next_i, next_j)

        return next_i, next_j, reward, done

    def is_valid_state(self, i, j):
        """Return True if ``(i, j)`` lies inside the grid."""
        return 0 <= i < self.env_size and 0 <= j < self.env_size

    def is_terminal_state(self, i, j):
        """Return True if ``(i, j)`` is the terminal state."""
        return (i, j) == self.terminal_state

    def get_size(self):
        """Return the side length of the grid."""
        return self.env_size

    def get_actions(self):
        """Return the list of ``(row delta, column delta)`` actions."""
        return self.actions


# Task 1.2 Run the code
- Using the newly updated reward function from Task 1, we run the same value iteration code from class.

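- Since transitions are deterministic, the Bellman optimality update reduces to $V(s) \leftarrow \max_a \left[ R(s') + \gamma V(s') \right]$, where $s'$ is the single state reached from $s$ by taking action $a$ (no expectation over next states is needed).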

- We iterate until convergence and then extract the optimal policy by selecting, at each state, the action that maximizes the Bellman expression.

In [None]:
# Value iteration on the gridworld, followed by greedy policy extraction.

# Parameters
gamma = 0.9   # discount factor
theta = 1e-6  # stop once the largest value change in a sweep drops below this

env = GridWorld(5)
size = env.get_size()


def bellman_action_values(grid, values, discount, i, j):
    """Return the one-step lookahead value of every action from state (i, j).

    Entry k is ``reward + discount * values[next state]`` for action index k,
    where the next state and reward come from ``grid.step``.  Shared by the
    value-iteration sweep and the policy extraction so both use the same
    backup.
    """
    lookahead = []
    for a in range(len(grid.get_actions())):
        next_i, next_j, reward, _ = grid.step(a, i, j)
        lookahead.append(reward + discount * values[next_i, next_j])
    return lookahead


# Initialize value table
V = np.zeros((size, size))

converged = False
iterations = 0

while not converged:
    delta = 0
    # Synchronous update: every backup in this sweep reads the previous V.
    new_V = np.copy(V)

    for i in range(size):
        for j in range(size):

            # Skip terminal state: its value stays at 0.
            if env.is_terminal_state(i, j):
                continue

            new_V[i, j] = max(bellman_action_values(env, V, gamma, i, j))

            delta = max(delta, abs(V[i, j] - new_V[i, j]))

    V = new_V
    iterations += 1

    if delta < theta:
        converged = True

print("Converged in", iterations, "iterations")

# Extract optimal policy: act greedily with respect to the converged V.
policy = np.empty((size, size), dtype=object)

for i in range(size):
    for j in range(size):

        if env.is_terminal_state(i, j):
            policy[i, j] = "G"
            continue

        # np.argmax returns the first maximal index, so ties are broken in
        # the order Right, Left, Down, Up.
        best_action = np.argmax(bellman_action_values(env, V, gamma, i, j))
        policy[i, j] = env.action_description[best_action]

print("\nOptimal Value Function (V*):")
print(np.round(V, 2))

# Written as an escape so the label survives a file-encoding round trip.
print("\nOptimal Policy (\u03c0*):")
print(policy)
