# Importing Libraries

In [23]:
from env import Grid, test_agent
import gymnasium as gym 
import numpy as np

# Initializing the Environment

In [24]:
# Side length of the square grid; the policy and value tables further down are sized from it.
envSize = 4
# Grid is provided by the local `env` module. NOTE(review): type="random" presumably
# randomises the layout (start/goal) — confirm against env.py.
env = Grid(size=envSize,type="random")


In [25]:
# Despite its name, `info` is indexed below as a pair:
# info[0] is reported as the starting position and info[1] as the ending position.
info = env.reset()
print(f"The starting position is {info[0]}")
print(f"The ending position is {info[1]}")

The starting position is [0 0]
The ending position is [3 1]


# Defining the policy

In [26]:
# Uniform starting policy: every grid cell assigns probability 0.25 to each of its 4 actions.
policy_probs = np.full(shape=(envSize, envSize, 4), fill_value=0.25)
print(policy_probs)

[[[0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]]

 [[0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]]

 [[0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]]

 [[0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]
  [0.25 0.25 0.25 0.25]]]


In [27]:
def policy(current_pos):
    """Return the action-probability vector for one grid cell.

    Args:
        current_pos: (row, col) position given as a tuple, list, or 1-D
            NumPy array of two integers.

    Returns:
        The slice of the notebook-level ``policy_probs`` table for that cell:
        one probability per action.
    """
    # Positions printed by the environment are NumPy arrays. Indexing with an
    # array (or list) triggers fancy indexing and would select two whole rows
    # instead of a single cell, so the index is normalised to a tuple first.
    return policy_probs[tuple(current_pos)]

#### Testing the policy with state (0,0)

In [28]:
# Look up the action distribution stored for state (0, 0) and list it action by action.
action_probablities = policy((0,0))
# enumerate() follows the length of the returned vector, so the action count is not
# duplicated here as a hard-coded range(4).
for action, prob in enumerate(action_probablities):
    print(f"Probablity of taking action {action}: {prob}")

Probablity of taking action 0: 0.25
Probablity of taking action 1: 0.25
Probablity of taking action 2: 0.25
Probablity of taking action 3: 0.25


# Defining the Value table

In [29]:
# Value table holding one estimate per grid cell, initialised to zero.
state_values = np.zeros(shape=(envSize, envSize))
print(state_values)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


# Implementing the value iteration algorithm

In [30]:
def value_iteration(policy_probs, state_values, theta=1e-6, gamma=0.99):
    """Run in-place value iteration and record the greedy policy.

    Every sweep visits each grid cell, evaluates the one-step lookahead
    ``reward + gamma * V(next_state)`` for every action, stores the best
    lookahead as the cell's new value and makes the policy deterministic on
    the action that achieved it (ties go to the lowest action index). Sweeps
    repeat until the largest value change within a sweep is no greater than
    ``theta``.

    Args:
        policy_probs: Array of shape (rows, cols, n_actions). Overwritten in
            place with a one-hot distribution per cell.
        state_values: Array of shape (rows, cols). Overwritten in place with
            the updated state values; updates made earlier in a sweep are
            visible to cells visited later in the same sweep.
        theta: Convergence threshold on the largest per-sweep value change.
        gamma: Discount factor applied to the successor state's value.

    Returns:
        None. Both arrays are modified in place.

    Notes:
        Transitions are queried from the notebook-level ``env`` through
        ``env.simulate_step((row, col), action)``.
        NOTE(review): the third and fourth values returned by
        ``simulate_step`` are ignored, so the successor's value is always
        bootstrapped — confirm this is intended for terminal states.
    """
    # Take the dimensions from the tables themselves rather than from notebook
    # globals, so the function works for any grid shape and action count.
    n_rows, n_cols = state_values.shape
    n_actions = policy_probs.shape[-1]

    delta = float("inf")
    while delta > theta:
        delta = 0.0

        for row in range(n_rows):
            for col in range(n_cols):
                old_value = state_values[row, col]

                q_values = np.empty(n_actions)
                for action in range(n_actions):
                    next_state, reward, _, _ = env.simulate_step((row, col), action)
                    # tuple() guarantees a single-cell lookup even if the
                    # environment hands back the position as an array or list.
                    q_values[action] = reward + gamma * state_values[tuple(next_state)]

                # argmax returns the first maximal entry, i.e. the lowest
                # action index among ties.
                best_action = int(np.argmax(q_values))
                best_value = q_values[best_action]

                greedy = np.zeros(n_actions)
                greedy[best_action] = 1

                state_values[row, col] = best_value
                policy_probs[row, col] = greedy
                delta = max(delta, abs(best_value - old_value))


In [31]:
# Updates state_values and policy_probs in place using the default theta and gamma.
value_iteration(policy_probs, state_values)

In [32]:
# State values as left by the in-place update in value_iteration.
print(state_values)

[[96.05950237 97.02980137 96.05950335 95.09890832]
 [97.02980137 98.00990137 97.02980235 96.05950433]
 [98.00990137 98.99990137 98.00990235 97.02980333]
 [98.99990137 99.99990137 98.99990235 98.00990333]]


In [33]:
# Policy table as left by value_iteration: one-hot per cell on the selected action.
print(policy_probs)

[[[0. 1. 0. 0.]
  [0. 0. 1. 0.]
  [0. 0. 1. 0.]
  [0. 0. 1. 0.]]

 [[0. 1. 0. 0.]
  [0. 0. 1. 0.]
  [0. 0. 1. 0.]
  [0. 0. 1. 0.]]

 [[0. 1. 0. 0.]
  [0. 0. 1. 0.]
  [0. 0. 1. 0.]
  [0. 0. 1. 0.]]

 [[0. 1. 0. 0.]
  [0. 0. 1. 0.]
  [0. 0. 0. 1.]
  [0. 0. 0. 1.]]]
