In [242]:
import numpy as np


In [243]:
# --- Configuration for the gridworld policy-evaluation experiment ---

# Side length used for the square grid.
GRID_SIZE = 4

# Discount factor. Set a hair below 1.0 instead of exactly 1.0; per the
# original author's note this is a practical approximation of gamma = 1.0
# meant to avoid floating-point precision issues and smooth convergence.
GAMMA = 0.9999999999

# Sweeping stops once the largest per-state change in a sweep is below this.
THETA = 0.0001

# Upper bound on the number of evaluation sweeps.
MAX_ITERATIONS = 1000

# The four moves available to the agent.
ACTIONS = ['up', 'down', 'left', 'right']

# Equiprobable random policy: every action gets probability 1 / len(ACTIONS).
ACTION_PROBS = dict.fromkeys(ACTIONS, 1.0 / len(ACTIONS))

# Sweep counts at which the value table is snapshotted (0 = initial table).
ITERATIONS_TO_SAVE = [0, 1, 2, 3, 10]


In [244]:
class Gridworld:
    """
    A square grid of cells with two terminal corners.

    States are (row, col) tuples. The top-left cell (0, 0) and the
    bottom-right cell (grid_size - 1, grid_size - 1) are terminal.
    """

    def __init__(self, grid_size=GRID_SIZE):
        """
        Build the state list, the terminal states and the action set.

        Parameters:
        - grid_size: Side length of the grid (defaults to GRID_SIZE).
        """
        self.grid_size = grid_size

        # Enumerate every cell row by row.
        cells = []
        for row in range(grid_size):
            for col in range(grid_size):
                cells.append((row, col))
        self.states = cells

        self.terminal_states = [(0, 0), (grid_size - 1, grid_size - 1)]
        self.actions = ACTIONS

    def step(self, state, action):
        """
        Apply an action in a state and report the outcome.

        Parameters:
        - state: Current (row, col) cell.
        - action: One of 'up', 'down', 'left', 'right'.

        Returns:
        - (next_state, reward). A terminal state returns itself with reward 0
          without looking at the action. From any other state the move is
          clamped to the grid edges and the reward is -1.

        Raises:
        - ValueError: If the state is non-terminal and the action is not one
          of the four known moves.
        """
        # Terminal states absorb: nothing moves and nothing is earned.
        if state in self.terminal_states:
            return state, 0

        row, col = state
        move_reward = -1  # every move from a non-terminal state costs 1

        # max/min keep the agent in place when it would step off the grid.
        if action == 'up':
            return (max(row - 1, 0), col), move_reward
        if action == 'down':
            return (min(row + 1, self.grid_size - 1), col), move_reward
        if action == 'left':
            return (row, max(col - 1, 0)), move_reward
        if action == 'right':
            return (row, min(col + 1, self.grid_size - 1)), move_reward

        raise ValueError("Invalid action.")


In [245]:
def create_random_policy(actions):
    """
    Build a uniform (equiprobable) policy over the given actions.

    Parameters:
    - actions: Sized collection of actions.

    Returns:
    - Dict mapping each action to 1 / len(actions); an empty dict when
      `actions` is empty.
    """
    policy = {}
    for name in actions:
        policy[name] = 1.0 / len(actions)
    return policy


In [246]:
def evaluate_state(env, state, v, action_probs, gamma=None):
    """
    Compute the one-step expected value of a state under the given policy.

    For every action in `env.actions`, the environment is queried for the
    resulting next state and reward, and the policy-weighted sum of
    `reward + gamma * v[next_state]` is returned.

    Parameters:
    - env: Environment exposing `terminal_states`, `actions` and
      `step(state, action) -> (next_state, reward)`.
    - state: The state to evaluate.
    - v: Current value estimates, indexable by state.
    - action_probs: Mapping from action to its probability under the policy.
    - gamma: Discount factor. When None (the default) the module-level
      GAMMA is used, which matches the previous behaviour.

    Returns:
    - The backed-up value of `state`; 0 for terminal states.
    """
    if state in env.terminal_states:
        return 0  # Terminal states have a value of 0

    # Resolve the default at call time. `is None` (not truthiness) so that an
    # explicit gamma of 0 is honoured.
    if gamma is None:
        gamma = GAMMA

    v_new = 0
    for action in env.actions:
        next_state, reward = env.step(state, action)
        v_new += action_probs[action] * (reward + gamma * v[next_state])
    return v_new


In [247]:
def policy_evaluation(env, action_probs, iterations_to_save=ITERATIONS_TO_SAVE):
    """
    Run iterative policy evaluation until the values settle or the sweep cap is hit.

    Each sweep recomputes every state's value from a frozen copy of the
    previous sweep's table, so all updates within a sweep see the same inputs.

    Parameters:
    - env: Environment exposing `grid_size`, `states` and whatever
      `evaluate_state` needs.
    - action_probs: Mapping from action to its probability under the policy.
    - iterations_to_save: Sweep counts at which a copy of the table is kept
      (0 refers to the initial all-zero table).

    Returns:
    - (v, v_dict): the final value table, and a dict of saved copies keyed by
      sweep count plus the key 'infty' for the final table.
    """
    side = env.grid_size
    values = np.zeros((side, side))
    snapshots = {}

    if 0 in iterations_to_save:
        snapshots[0] = values.copy()

    sweep = 0
    finished = False
    while not finished:
        sweep += 1
        previous = values.copy()  # frozen inputs for this sweep
        largest_change = 0

        for state in env.states:
            updated = evaluate_state(env, state, previous, action_probs)
            largest_change = max(largest_change, abs(updated - values[state]))
            values[state] = updated

        if sweep in iterations_to_save:
            snapshots[sweep] = values.copy()

        # Stop on convergence or once the sweep budget is exhausted.
        finished = largest_change < THETA or sweep >= MAX_ITERATIONS

    snapshots['infty'] = values.copy()
    return values, snapshots


In [248]:
def format_and_print(v_k, iteration_label):
    """
    Format and print the value function with row and column labels.

    Each value is first rounded to one decimal place and then rendered with
    two significant digits. Row and column counts are taken separately from
    the array's shape, so rectangular tables print all of their columns.

    Parameters:
    - v_k: The value function at iteration k (2D numpy array).
    - iteration_label: A string representing the iteration (e.g., 'k = 0', 'k = ∞').
    """
    n_rows, n_cols = v_k.shape

    # Round to one decimal place.
    v_rounded = np.round(v_k, decimals=1)

    # Render each entry with two significant digits (fractional=False makes
    # `precision` count significant digits; trim='k' keeps trailing zeros and
    # the decimal point).
    v_significant = np.vectorize(
        lambda x: np.format_float_positional(
            x, precision=2, unique=False, fractional=False, trim='k'
        )
    )
    v_str = v_significant(v_rounded.astype(float))

    print(f"\nv_k for the random policy at {iteration_label}:")

    # Print column headers
    col_headers = ['Col {}'.format(j) for j in range(n_cols)]
    print('       ' + '  '.join('{:>5}'.format(ch) for ch in col_headers))

    # Print each row with row labels
    for i in range(n_rows):
        row_values = '  '.join('{:>5}'.format(cell) for cell in v_str[i])
        print('Row {:<2}  {}'.format(i, row_values))


In [249]:
def main():
    """
    Main function to perform policy evaluation and print the value functions.

    Builds the gridworld and an equiprobable random policy, evaluates the
    policy, then prints the saved value table for each entry of
    ITERATIONS_TO_SAVE followed by the final table.
    """
    env = Gridworld()
    action_probs = create_random_policy(env.actions)
    _, v_dict = policy_evaluation(env, action_probs)

    # Print the value functions at specified iterations
    for k in ITERATIONS_TO_SAVE + ['infty']:
        # Snapshots exist only for sweeps that actually ran; if evaluation
        # stopped before sweep k there is nothing to show, so say so instead
        # of failing with a KeyError.
        if k not in v_dict:
            print(f"\n(no snapshot for k = {k}: evaluation stopped earlier)")
            continue

        iteration_label = f'k = {k}' if k != 'infty' else 'k = ∞'
        format_and_print(v_dict[k], iteration_label)


if __name__ == "__main__":
    main()



v_k for the random policy at k = 0:
       Col 0  Col 1  Col 2  Col 3
Row 0     0.0    0.0    0.0    0.0
Row 1     0.0    0.0    0.0    0.0
Row 2     0.0    0.0    0.0    0.0
Row 3     0.0    0.0    0.0    0.0

v_k for the random policy at k = 1:
       Col 0  Col 1  Col 2  Col 3
Row 0     0.0   -1.0   -1.0   -1.0
Row 1    -1.0   -1.0   -1.0   -1.0
Row 2    -1.0   -1.0   -1.0   -1.0
Row 3    -1.0   -1.0   -1.0    0.0

v_k for the random policy at k = 2:
       Col 0  Col 1  Col 2  Col 3
Row 0     0.0   -1.7   -2.0   -2.0
Row 1    -1.7   -2.0   -2.0   -2.0
Row 2    -2.0   -2.0   -2.0   -1.7
Row 3    -2.0   -2.0   -1.7    0.0

v_k for the random policy at k = 3:
       Col 0  Col 1  Col 2  Col 3
Row 0     0.0   -2.4   -2.9   -3.0
Row 1    -2.4   -2.9   -3.0   -2.9
Row 2    -2.9   -3.0   -2.9   -2.4
Row 3    -3.0   -2.9   -2.4    0.0

v_k for the random policy at k = 10:
       Col 0  Col 1  Col 2  Col 3
Row 0     0.0   -6.1   -8.4   -9.0
Row 1    -6.1   -7.7   -8.4   -8.4
Row 2    -8.4 