Import Necessary Libraries

In [26]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.table import Table

# Select the non-interactive 'Agg' backend: figures are rendered off-screen
# and only reach disk through an explicit savefig() call.
matplotlib.use('Agg')

Define the GridWorld Environment
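We define the GridWorld class, which represents the environment in which the agent operates: a square grid (5×5 by default) in which a move that would leave the grid costs -1 and keeps the agent in place, every other move has reward 0, and the special cells A and B send the agent to A′ and B′ with rewards of +10 and +5, respectively.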

In [27]:
class GridWorld:
    """Deterministic square grid world with two special "teleport" cells.

    States are ``[row, col]`` pairs and actions are 2-element offset vectors
    added to the state.  Taking any action in cell A (or B) moves the agent
    to A' (or B') and yields a reward of 10 (or 5).  Any other move yields 0,
    unless it would leave the grid, in which case the agent stays put and
    receives -1.0.
    """

    def __init__(self, world_size=5, discount=0.9):
        """Create the environment.

        Args:
            world_size: Number of rows and of columns in the grid.
            discount: Discount factor stored for use by agents.
        """
        self.world_size = world_size
        self.discount = discount

        # Special cells and their destinations.  These coordinates are fixed,
        # so A' (row 4) only lies inside the grid when world_size >= 5.
        self.A_POS = [0, 1]
        self.A_PRIME_POS = [4, 1]
        self.B_POS = [0, 3]
        self.B_PRIME_POS = [2, 3]

        # Possible actions as (row, col) offsets: Left, Up, Right, Down
        self.ACTIONS = [np.array([0, -1]),
                        np.array([-1, 0]),
                        np.array([0, 1]),
                        np.array([1, 0])]

        # Every [row, col] cell of the grid, in row-major order
        self.state_space = [
            [i, j] for i in range(self.world_size) for j in range(self.world_size)
        ]

    def step(self, state, action):
        """Apply ``action`` in ``state`` and return the outcome.

        Args:
            state: Current ``[row, col]`` position; a list, tuple or NumPy
                array is accepted.
            action: 2-element offset added to ``state``.

        Returns:
            ``(next_state, reward)`` where ``next_state`` is a freshly
            created ``[row, col]`` list, so callers may mutate it without
            affecting the environment.
        """
        # Normalise to a plain list: comparing a tuple with the list-valued
        # special positions is always False, and comparing an ndarray yields
        # an array whose truth value is ambiguous.
        current = np.asarray(state).tolist()

        # Special rules for positions A and B (the action is ignored).
        # Copies are returned so the stored destinations cannot be mutated.
        if current == self.A_POS:
            return list(self.A_PRIME_POS), 10
        if current == self.B_POS:
            return list(self.B_PRIME_POS), 5

        candidate = (np.asarray(current) + action).tolist()
        row, col = candidate

        inside = 0 <= row < self.world_size and 0 <= col < self.world_size
        if inside:
            return candidate, 0
        # Moving off the grid: stay in place and pay a penalty
        return current, -1.0

    def get_possible_actions(self):
        """Return the list of action offset vectors."""
        return self.ACTIONS

    def get_all_states(self):
        """Return the list of all ``[row, col]`` states in the grid."""
        return self.state_space


Define the Value Iteration Agent
We define the ValueIterationAgent class, which performs the value iteration algorithm on the environment.

In [28]:
class ValueIterationAgent:
    """Computes state values for a grid environment by value iteration.

    The environment must provide ``world_size``, ``discount``,
    ``get_all_states()``, ``get_possible_actions()`` and
    ``step(state, action) -> (next_state, reward)``.
    """

    def __init__(self, env, threshold=1e-4):
        """Store the environment and start from an all-zero value table.

        Args:
            env: Environment object (see class docstring for what is used).
            threshold: A sweep whose largest value change is below this
                number ends the iteration.
        """
        self.env = env
        self.threshold = threshold
        self.values = np.zeros((env.world_size, env.world_size))
        # Number of sweeps performed by the most recent value_iteration() call
        self.iterations = 0

    def value_iteration(self, max_iterations=None):
        """Run sweeps until convergence and return the value table.

        Each sweep computes every state's new value from the table of the
        previous sweep, then replaces the table in one go.

        Args:
            max_iterations: Optional upper bound on the number of sweeps.
                ``None`` (the default) iterates until convergence, which may
                never happen if the values diverge (e.g. discount >= 1).

        Returns:
            The ``(world_size, world_size)`` array of state values, which is
            also stored in ``self.values``.
        """
        self.iterations = 0
        while max_iterations is None or self.iterations < max_iterations:
            delta = 0
            new_values = np.copy(self.values)
            for state in self.env.get_all_states():
                i, j = state
                best = self.compute_state_value(state)
                # Track the largest change seen in this sweep
                delta = max(delta, abs(self.values[i, j] - best))
                new_values[i, j] = best
            self.values = new_values
            self.iterations += 1
            if self.has_converged(delta):
                break
        return self.values

    def compute_state_value(self, state):
        """Return the best one-step backup for ``state`` over all actions.

        For each action the backup is ``reward + discount * V(next_state)``,
        using the current value table.
        """
        backups = []
        for action in self.env.get_possible_actions():
            (next_i, next_j), reward = self.env.step(state, action)
            backups.append(reward + self.env.discount * self.values[next_i, next_j])
        return max(backups)

    def has_converged(self, delta):
        """Return True when ``delta`` is strictly below the threshold."""
        return delta < self.threshold

    def get_optimal_values(self):
        """Return the current value table (optimal once iteration converged)."""
        return self.values


Define the Visualizer
We define the Visualizer class, which handles the visualization of the value function.

In [29]:
class Visualizer:
    """Renders a 2-D array of values as a table image."""

    @staticmethod
    def draw_image(values, filename='value_iteration_result.png'):
        """Draw ``values`` as a table and save the figure to ``filename``.

        Declared as a static method so it can be called either on the class
        or on an instance.

        Args:
            values: 2-D NumPy array; one table cell is drawn per element.
            filename: Path of the image file to write.
        """
        fig, ax = plt.subplots()
        ax.set_axis_off()
        tb = Table(ax, bbox=[0, 0, 1, 1])

        nrows, ncols = values.shape
        width, height = 1.0 / ncols, 1.0 / nrows

        # Add cells with the values
        for (i, j), val in np.ndenumerate(values):
            tb.add_cell(i, j, width, height, text=str(val),
                        loc='center', facecolor='white')

        # Row labels (1-based) in an extra column to the left of the grid
        for i in range(nrows):
            tb.add_cell(i, -1, width, height, text=str(i + 1), loc='right',
                        edgecolor='none', facecolor='none')

        # Column labels (1-based) in an extra half-height row above the grid;
        # iterating over ncols keeps the labels right for non-square arrays
        for j in range(ncols):
            tb.add_cell(-1, j, width, height / 2, text=str(j + 1), loc='center',
                        edgecolor='none', facecolor='none')

        ax.add_table(tb)
        try:
            fig.savefig(filename)
        finally:
            # Always release the figure, even if saving fails
            plt.close(fig)

Main Execution Block
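We define the main function to tie everything together: it builds the environment, runs value iteration until convergence, and saves the value table (rounded to one decimal place) to figure_3_5.png.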

In [30]:
def main():
    """Solve the default grid world and save its value table as an image."""
    # Build the default environment and hand it to the solver
    grid = GridWorld()
    solver = ValueIterationAgent(grid)

    # Iterate to convergence, then round for display
    converged_values = solver.value_iteration()
    rounded_values = np.round(converged_values, decimals=1)

    # Write the table to disk
    Visualizer.draw_image(rounded_values, 'figure_3_5.png')

# Run the whole pipeline when this cell executes; the table image is written
# to figure_3_5.png.
main()