In [0]:
import numpy as np

In [0]:
#creating a grid environment
def gridEnv(curr_state,action):
  # In the 4*4 grid world state 0 and state 15 are terminal states
  if(curr_state==0 or curr_state==15):
    next_state=curr_state
    reward=0
  else:
    #action 0 --> UP, action 1 --> DOWN, action 2 --> RIGHT, action 3 --> LEFT
    #if the action results in the agent coming out of the grid then the curr_state will not change
    reward=-1
    if(action==0):
      next_state=curr_state-4
      if(next_state<0):
        next_state=curr_state
    elif(action==1):
      next_state=curr_state+4
      if(next_state>=16):
        next_state=curr_state
    elif(action==2):
      if((curr_state+1)%4==0):
        next_state=curr_state
      else:
        next_state=curr_state+1
    elif(action==3):
      if(curr_state%4==0):
        next_state=curr_state
      else:
        next_state=curr_state-1
#if the next_state is the terminanl state the reward is 0 otherwise -1
  #In this grid world the state transition probability for the given action and state is 1.
  items=[1,next_state,reward]

  return items
  

In [0]:
def polyEval(policy,gamma=1,theta=0.00000001):
  #Initializing the value function to zero
  vf=np.zeros(16)
  while True:
    delta=0
    #Iterating over the states
    for curr_state in range(16):
      new_vf=0
      #iterating over the policy of a given state
      for curr_action, p_action in enumerate(policy[curr_state]):
        stateTprob,next_state,reward=gridEnv(curr_state,curr_action)
        new_vf+=p_action*stateTprob*(reward+gamma*vf[next_state])
      #new_vf is calcuated using Bellman-equation
      #delta quantity is used to track the difference between initial VF and updated VF
      delta=max(delta,np.abs(new_vf-vf[curr_state]))
      vf[curr_state]=new_vf
    if(delta<theta):
      break
  #updated value function is returned
  return vf

In [0]:
def polyIteration(policy, gamma=0.9999):
    #Initilizing the current policy as opitmal policy
    policy_stable = True
    #Obtaining the value function from the current policy
    vf = polyEval(policy)
    #Iterating over all the states
    for curr_state in range(16):
        #b_action1 stores the optimal action for the current state as per the policy
        b_action1 = np.argmax(policy[curr_state])
        avp = np.zeros(4)
        #iterating over all the actions
        for curr_action in range(4):
            stateTprob, next_state, reward = gridEnv(curr_state, curr_action)
            avp[curr_action] = stateTprob*(reward+gamma*vf[next_state])
        #Bellman Optimality equation, taken greedily
        b_action2 = np.argmax(avp)
        #Updating the policy
        policy[curr_state] = np.eye(4)[b_action2]
        #If the updated best action is not same as the initial best action for a given state, the policy is still not optimal
        if(b_action1 != b_action2):
            policy_stable = False
    if(policy_stable):
        return policy,vf
    else:
        return polyIteration(policy)


In [0]:
#initializing random policy where each action has equal probability
policy = np.ones([16, 4])/4
policy,vf= polyIteration(policy)
print(f'Optimal Policy:\n\n{policy}\n')
print(f'Optimal Value function:\n\n {vf.reshape(4, 4)}')


Optimal Policy:

[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]]

Optimal Value function:

 [[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]
