In [7]:
import numpy as np


In [8]:
import numpy as np

ACTION_SPACE = ('U', 'D', 'L', 'R')

class Grid:  # Environment
    """Rectangular grid world holding the position of a single agent.

    A state is a ``(row, col)`` pair. The agent position is stored in the
    ``i`` (row) and ``j`` (column) attributes.
    """

    def __init__(self, rows, cols, start):
        """Create a ``rows`` x ``cols`` grid with the agent placed at ``start``."""
        self.rows = rows
        self.cols = cols
        self.set_state(start)

    def set_state(self, s):
        """Place the agent at ``s``, read as ``(s[0], s[1])``; no bounds check."""
        self.i = s[0]
        self.j = s[1]

    def current_state(self):
        """Return the agent position as a ``(row, col)`` tuple."""
        return self.i, self.j

    def is_terminal(self, s):
        """Return True when ``s`` equals the bottom-right corner tuple."""
        goal = (self.rows - 1, self.cols - 1)
        return s == goal

    def move(self, action):
        """Apply ``action`` and return the resulting state.

        Actions outside ``ACTION_SPACE`` leave the position untouched. A move
        that would cross a border is clamped, so the agent stays on the edge.
        """
        if action not in ACTION_SPACE:
            return self.current_state()

        if action == 'U':
            self.i = max(0, self.i - 1)
        elif action == 'D':
            self.i = min(self.rows - 1, self.i + 1)
        elif action == 'L':
            self.j = max(0, self.j - 1)
        elif action == 'R':
            self.j = min(self.cols - 1, self.j + 1)
        return self.current_state()

    def all_states(self):
        """Return every ``(row, col)`` pair of the grid in row-major order."""
        states = []
        for row in range(self.rows):
            for col in range(self.cols):
                states.append((row, col))
        return states

def print_values(V, g):
    """Print the state values of grid ``g`` as a column-aligned table.

    Args:
        V: Mapping from ``(row, col)`` to a numeric value. States missing
            from the mapping are shown as ``0``.
        g: Object exposing integer ``rows`` and ``cols`` attributes.

    Each value is rendered with two decimals and a sign slot (blank for
    non-negative numbers, ``-`` otherwise), then right-aligned to the widest
    rendered cell. Reserving only the sign slot is not enough once values
    differ in their number of integer digits (e.g. ``-106.81`` next to
    ``-97.62``), which is why the common width is computed first. When all
    cells already share a width the output is unchanged.
    """
    # Render every cell up front so the common column width is known.
    cells = [
        [f"{V.get((i, j), 0): .2f}" for j in range(g.cols)]
        for i in range(g.rows)
    ]
    # default=0 keeps an empty grid (no rows or no columns) from raising.
    width = max((len(cell) for row in cells for cell in row), default=0)

    for row in cells:
        print("---------------------------")
        for cell in row:
            print(f"{cell:>{width}} ", end=" ")
        print("")

def print_policy(P, g):
    """Print the action stored for every cell of grid ``g``, one row per line.

    ``P`` maps ``(row, col)`` to the value to display; cells without an entry
    are shown as a single blank. ``g`` must expose ``rows`` and ``cols``.
    """
    separator = "---------------------------"
    for row in range(g.rows):
        print(separator)
        for col in range(g.cols):
            action = P.get((row, col), ' ')
            print("  {}  ".format(action), end=" ")
        print("")

def iterative_policy_evaluation(grid, policy, gamma=1.0, theta=1e-4, *, reward=-1):
    """Estimate state values for the equiprobable policy by repeated sweeps.

    Every sweep updates each non-terminal state in place with the average,
    over all actions in ``ACTION_SPACE``, of ``reward + gamma * V[next_state]``.
    Sweeping stops once the largest change within a sweep drops below
    ``theta``.

    Args:
        grid: Environment providing ``all_states()``, ``is_terminal(s)``,
            ``set_state(s)``, ``current_state()`` and ``move(action)``.
        policy: Kept for interface compatibility. NOTE: it is not consulted;
            every action in ``ACTION_SPACE`` is weighted equally regardless
            of what this mapping contains.
        gamma: Discount factor applied to the successor state's value.
        theta: Convergence threshold on the largest per-sweep value change.
        reward: Reward received for every action (keyword-only, default -1).

    Returns:
        Dict mapping each state to its estimated value. Terminal states are
        never updated and keep their initial value of 0.
    """
    states = grid.all_states()  # fetched once; reused by every sweep
    # Weight derived from the action set instead of a literal 0.25, so the
    # average stays correct if ACTION_SPACE changes size.
    action_prob = 1.0 / len(ACTION_SPACE) if ACTION_SPACE else 0.0

    # Initialize V(s) = 0 for all states
    V = {s: 0 for s in states}

    # The sweeps reposition the agent; remember where the caller left it.
    saved_state = grid.current_state()
    try:
        while True:
            delta = 0
            for s in states:
                if grid.is_terminal(s):
                    continue
                old_v = V[s]
                new_v = 0
                for a in ACTION_SPACE:
                    grid.set_state(s)  # move() acts on the grid's own position
                    new_state = grid.move(a)
                    new_v += action_prob * (reward + gamma * V[new_state])
                V[s] = new_v
                delta = max(delta, abs(old_v - new_v))
            if delta < theta:
                break
    finally:
        grid.set_state(saved_state)
    return V

if __name__ == "__main__":
    grid = Grid(5, 5, (0, 0))

    # Sample one action per state, each drawn uniformly from ACTION_SPACE.
    # NOTE: iterative_policy_evaluation weights all actions equally and does
    # not read this mapping, so the values printed below are those of the
    # equiprobable policy rather than of the sampled actions.
    policy = {s: np.random.choice(ACTION_SPACE) for s in grid.all_states()}

    # Run the evaluation with the default gamma and theta.
    V = iterative_policy_evaluation(grid, policy)

    # Show the estimated state values.
    print("Value function:")
    print_values(V, grid)

    # Show the sampled per-state actions.
    print("Policy:")
    print_policy(policy, grid)


Value function:
---------------------------
-106.81  -104.81  -101.37  -97.62  -95.07  
---------------------------
-104.81  -102.25  -97.69  -92.40  -88.53  
---------------------------
-101.37  -97.69  -90.74  -81.78  -74.10  
---------------------------
-97.62  -92.40  -81.78  -65.89  -48.00  
---------------------------
-95.07  -88.53  -74.10  -48.00   0.00  
Policy:
---------------------------
  R     D     D     U     R   
---------------------------
  R     D     L     L     R   
---------------------------
  D     U     R     D     R   
---------------------------
  U     L     D     R     D   
---------------------------
  R     R     D     L     U   
