Import Necessary Libraries

In [8]:
import os
import numpy as np
import matplotlib
matplotlib.use('Agg')  # Use 'Agg' backend for environments without a display
import matplotlib.pyplot as plt
from matplotlib.table import Table

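The constants used throughout the notebook are defined here.
WORLD_SIZE sets the side length of the square grid world (4, giving a 4x4 grid).
ACTIONS lists the four possible moves (left, up, right, down) as numpy offset arrays.
ACTION_PROB is the probability of choosing each action (0.25, i.e. an equiprobable random policy).
REWARD is the constant reward of -1 received on every move out of a non-terminal state.
DISCOUNT is the discount factor applied to future rewards (1.0, i.e. undiscounted).
OUTPUT_DIR is the directory where the table images are saved.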

In [9]:
# Side length of the square grid world.
WORLD_SIZE = 4

# Moves expressed as offsets that are added to a state's two coordinates,
# in the order: left, up, right, down.
ACTIONS = [
    np.array(offset)
    for offset in ((0, -1), (-1, 0), (0, 1), (1, 0))
]

# Probability of picking each action (equal for all four).
ACTION_PROB = 0.25

# Reward received for every move (uniform step cost).
REWARD = -1

# Discount factor applied to the value of the next state.
DISCOUNT = 1.0

# Folder that receives the rendered state-value tables.
OUTPUT_DIR = "state_value_tables"

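We define the GridWorld class to represent the environment.
The `__init__` method stores the grid size and step reward, along with the shared action list and action probability.
The `is_terminal` method checks whether a state is one of the two terminal corners, (0, 0) or (size - 1, size - 1).
The `step` method returns the next state and reward after taking an action; a move that would leave the grid keeps the agent in place.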

In [10]:
class GridWorld:
    """Square grid environment whose two opposite corners are terminal.

    A state is a pair of integer coordinates. The action set and the
    per-action probability are taken from the module-level ``ACTIONS`` and
    ``ACTION_PROB`` constants.
    """

    def __init__(self, size=WORLD_SIZE, reward=REWARD):
        """Store the grid side length and the reward paid for each move."""
        self.size, self.reward = size, reward
        self.actions, self.action_prob = ACTIONS, ACTION_PROB

    def is_terminal(self, state):
        """Tell whether ``state`` is the (0, 0) or (size - 1, size - 1) corner."""
        x, y = state
        far_corner = self.size - 1
        at_origin = x == 0 and y == 0
        return at_origin or (x == far_corner and y == far_corner)

    def step(self, state, action):
        """Apply ``action`` to ``state`` and return ``(next_state, reward)``.

        Terminal states absorb: the state is returned unchanged with reward 0.
        A move that would leave the grid returns the given ``state`` object
        itself; a valid move returns the new coordinates as a list. Both
        non-terminal cases pay ``self.reward``.
        """
        if self.is_terminal(state):
            return state, 0

        candidate = np.array(state) + action
        nx, ny = candidate

        # Off-grid moves are blocked, but the step cost is still charged.
        off_grid = nx < 0 or ny < 0 or nx >= self.size or ny >= self.size
        if off_grid:
            return state, self.reward

        return candidate.tolist(), self.reward


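The PolicyEvaluator class handles the policy evaluation process.
The `__init__` method stores the environment and discount factor and initializes all state values to zero.
The `evaluate_policy` method repeatedly sweeps over the states until the largest change in a sweep falls below 1e-4, saving snapshots of the values at the requested iterations.
The `policy_evaluation_step` method performs one sweep, updating every non-terminal state from the values of the previous sweep.
The `compute_state_value` method calculates the updated value of a single state as the probability-weighted average, over all actions, of the reward plus the discounted value of the next state.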

In [11]:
class PolicyEvaluator:
    """Iterative policy evaluation for an environment's equiprobable policy.

    The environment must expose ``size``, ``actions``, ``action_prob``,
    ``is_terminal(state)`` and ``step(state, action)``. State values are kept
    in a ``size x size`` array that starts at zero and is updated in place by
    successive sweeps.
    """

    def __init__(self, environment, discount=DISCOUNT):
        """Bind the environment and discount factor; zero all state values."""
        self.env = environment
        self.discount = discount
        self.state_values = np.zeros((self.env.size, self.env.size))

    def evaluate_policy(self, desired_iterations, theta=1e-4, max_iterations=None):
        """Sweep until the values stop changing, snapshotting chosen iterations.

        Evaluation continues from the current ``self.state_values``, so calling
        this method a second time resumes rather than restarts.

        Args:
            desired_iterations: Iterable of iteration numbers whose state
                values should be recorded (0 means the values before the
                first sweep). Iterations that are never reached get no entry.
            theta: Convergence tolerance; evaluation stops once the largest
                change in a sweep is below this value.
            max_iterations: Optional upper bound on the number of sweeps.
                ``None`` (the default) keeps sweeping until convergence.

        Returns:
            Tuple ``(saved_values, iteration)``: a dict mapping iteration
            number to a copy of the state values at that point, and the
            number of sweeps performed. ``saved_values[iteration]`` is always
            present and holds the final values.

        Raises:
            ValueError: If ``max_iterations`` is negative.
        """
        if max_iterations is not None and max_iterations < 0:
            raise ValueError('max_iterations must be non-negative or None')

        # A set gives O(1) membership checks inside the loop.
        wanted = set(desired_iterations)
        saved_values = {}
        iteration = 0
        delta = None
        converged = False

        if 0 in wanted:
            saved_values[0] = self.state_values.copy()

        # The bound guards against configurations that never settle.
        while max_iterations is None or iteration < max_iterations:
            delta = self.policy_evaluation_step()
            iteration += 1

            if iteration in wanted:
                saved_values[iteration] = self.state_values.copy()

            if delta < theta:
                converged = True
                break

        # Callers look up the final table under the returned iteration number.
        saved_values[iteration] = self.state_values.copy()
        if converged:
            print(f'Converged after {iteration} iterations.')
        else:
            print(f'Stopped after {iteration} iterations without converging (last delta={delta}).')

        return saved_values, iteration

    def policy_evaluation_step(self):
        """Run one synchronous sweep and return the largest value change.

        Every non-terminal state is recomputed from a frozen copy of the
        previous values, so the order in which states are visited does not
        affect the result.
        """
        old_state_values = self.state_values.copy()
        delta = 0

        for i in range(self.env.size):
            for j in range(self.env.size):
                if self.env.is_terminal([i, j]):
                    continue
                value = self.compute_state_value([i, j], old_state_values)
                delta = max(delta, abs(value - old_state_values[i, j]))
                self.state_values[i, j] = value
        return delta

    def compute_state_value(self, state, old_state_values):
        """Return the expected one-step return of ``state`` under the policy.

        For each action the environment's reward plus the discounted previous
        value of the resulting state is weighted by ``env.action_prob``.
        """
        value = 0
        for action in self.env.actions:
            next_state, reward = self.env.step(state, action)
            next_i, next_j = next_state
            value += self.env.action_prob * (reward + self.discount * old_state_values[next_i, next_j])
        return value


Define the Function to Draw the State Value Table

In [12]:
def draw_table(ax, state_values, iteration_label):
    """Draw ``state_values`` as a grid of text cells filling ``ax``.

    Args:
        ax: Matplotlib axes to draw on; its own axis decorations are hidden.
        state_values: 2-D array of values, one table cell per entry, shown
            with one decimal place.
        iteration_label: Value displayed in the title as ``k = <label>``.
    """
    ax.set_axis_off()
    grid = Table(ax, bbox=[0, 0, 1, 1])

    row_count, col_count = state_values.shape
    cell_width = 1.0 / col_count
    cell_height = 1.0 / row_count
    cell_style = dict(loc='center', facecolor='white', edgecolor='black')

    # One cell per array entry, visited in row-major order.
    for row in range(row_count):
        for col in range(col_count):
            grid.add_cell(row, col, cell_width, cell_height,
                          text=f'{state_values[row, col]:.1f}', **cell_style)

    ax.add_table(grid)
    ax.set_title(f'k = {iteration_label}', fontsize=14)


Define the Function to Save the State Value Table as an Image

In [13]:
def save_table_image(state_values, iteration_label):
    """Render a state-value table and save it as a PNG under ``OUTPUT_DIR``.

    The file is named ``state_values_k_<iteration_label>.png``. The output
    directory is created when missing, and a confirmation line is printed
    after a successful save.

    Args:
        state_values: 2-D array of values passed on to ``draw_table``.
        iteration_label: Label used in both the table title and the file name.
    """
    # Create directory if it doesn't exist, before any figure is allocated.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    image_path = os.path.join(OUTPUT_DIR, f'state_values_k_{iteration_label}.png')

    fig, ax = plt.subplots(figsize=(4, 4))
    try:
        draw_table(ax, state_values, iteration_label)
        # Work on this figure explicitly instead of pyplot's "current" figure.
        fig.tight_layout()
        fig.savefig(image_path)
    finally:
        # Release the figure even when drawing or saving fails, so repeated
        # calls do not accumulate open figures.
        plt.close(fig)

    print(f'Saved state values table for k={iteration_label} to "{image_path}".')

Main Execution Flow

In [14]:
def main():
    """Evaluate the random policy on the grid world and save table images.

    One image is written for each requested iteration that the evaluation
    actually reached, followed by one image of the final values.
    """
    desired_iterations = [0, 1, 2, 3, 10]

    # Initialize the environment and policy evaluator
    environment = GridWorld()
    evaluator = PolicyEvaluator(environment)

    # Perform policy evaluation
    saved_values, convergence_iteration = evaluator.evaluate_policy(desired_iterations)

    # Save images for each desired iteration
    for iteration in desired_iterations:
        # Evaluation may finish before a requested iteration; no snapshot
        # exists for it then, so skip it instead of raising KeyError.
        if iteration not in saved_values:
            print(f'Skipping k={iteration}: evaluation finished after {convergence_iteration} iterations.')
            continue
        save_table_image(saved_values[iteration], iteration)

    # Save the final convergence table
    save_table_image(saved_values[convergence_iteration], f'{convergence_iteration}_converged')

# Run the evaluation only when this code is the entry point (a script run or
# an executed notebook cell), not when it is imported as a module.
if __name__ == '__main__':
    main()

Converged after 173 iterations.
Saved state values table for k=0 to "state_value_tables\state_values_k_0.png".
Saved state values table for k=1 to "state_value_tables\state_values_k_1.png".
Saved state values table for k=2 to "state_value_tables\state_values_k_2.png".
Saved state values table for k=3 to "state_value_tables\state_values_k_3.png".
Saved state values table for k=10 to "state_value_tables\state_values_k_10.png".
Saved state values table for k=173_converged to "state_value_tables\state_values_k_173_converged.png".
