In [1]:
import numpy as np
import copy

In [2]:
# Initialize states and random policy

def initialization(num_rows=4, num_cols=4):
    '''
    Build the gridworld and its equiprobable random policy.

    The two terminal states are the top-left and bottom-right corners of the
    grid. Every other state yields a reward of -1 and may take any of the four
    moves.

    Arguments:
            num_rows - number of rows in the grid (default 4)
            num_cols - number of columns in the grid (default 4)
    Returns:
            V - a 2d array of shape (num_rows, num_cols) initialized as 0
            R - dictionary containing rewards for each state
            P - dictionary denoting equiprobable random policy
            states - array containing tuples of states in the gridworld
            terminal_states - array containing terminal states of the gridworld
    Raises:
            ValueError - if num_rows or num_cols is smaller than 1
    '''
    if num_rows < 1 or num_cols < 1:
        raise ValueError("num_rows and num_cols must both be at least 1")

    # Row-major enumeration: (0,0), (0,1), ..., (num_rows-1, num_cols-1)
    states = [(i, j) for i in range(num_rows) for j in range(num_cols)]

    # dict.fromkeys drops the duplicate corner of a 1x1 grid while keeping order
    terminal_states = list(dict.fromkeys([(0, 0), (num_rows - 1, num_cols - 1)]))

    # Size V from the grid dimensions instead of a hard-coded 4x4
    V = np.zeros([num_rows, num_cols])

    R = {}
    P = {}
    for state in states:
        if state in terminal_states:
            # Terminal states: no reward and no available action
            R[state] = 0
            P[state] = []
        else:
            R[state] = -1
            # A fresh list per state so editing one policy entry never affects another
            P[state] = ['L', 'R', 'D', 'U']
    return V, R, P, states, terminal_states

In [13]:
def get_state(action, state, valid_states=None):
  '''
  Return the state reached by taking `action` from `state`.

  A move that would leave the grid keeps the agent where it is.

  Arguments:
          action - one of 'L', 'R', 'U', 'D'
          state - (row, col) tuple of the current state
          valid_states - optional collection of states that make up the grid;
                         when omitted, the module-level `states` is used
                         (so `states` must already be defined in that case)
  Returns:
          the successor (row, col) tuple, or `state` itself if the move is off-grid
  Raises:
          ValueError - if `action` is not one of the four known moves
  '''
  # (row delta, col delta) for every supported move
  moves = {'L': (0, -1), 'R': (0, 1), 'U': (-1, 0), 'D': (1, 0)}
  if action not in moves:
    raise ValueError("Unknown action %r; expected one of 'L', 'R', 'U', 'D'" % (action,))

  if valid_states is None:
    # Backward-compatible fallback: read the grid from the notebook's global namespace
    valid_states = states

  d_row, d_col = moves[action]
  new_state = (state[0] + d_row, state[1] + d_col)
  if new_state not in valid_states:
    new_state = state
  return new_state

In [4]:
def get_array(V, R, P, state):
  '''
  Compute the one-step lookahead value of every action allowed in `state`.

  Arguments:
          V - value table indexed by state
          R - dictionary containing rewards for each state
          P - dictionary mapping each state to its list of allowed actions
          state - the state whose actions are evaluated
  Returns:
          list with one entry per action in P[state] (same order), each being
          R[state] + V[successor] rounded to 2 decimals
  '''
  def lookahead(move):
    # Resolve the successor first, then add the reward of the current state
    successor = get_state(move, state)
    return round(R[state] + V[successor], 2)

  return [lookahead(move) for move in P[state]]

def get_indices(arr, max_):
  '''
  Find every position in `arr` whose element equals `max_`.

  Arguments:
          arr - sequence of values
          max_ - value to look for (the caller passes the maximum of `arr`)
  Returns:
          list of matching indices in ascending order; empty if nothing matches
  '''
  return [position for position, value in enumerate(arr) if value == max_]

In [12]:
# Function to implement Value iteration
def value_iteration(V, P, R, states, terminal_states, gamma=1, theta=1e-9, max_iterations=2000):
    '''
    Run value iteration on the gridworld and extract the greedy (optimal) policy.

    Arguments:
            V - a 2d array initialized as 0 (not modified; a float copy is used)
            P - dictionary mapping each state to its list of allowed actions
                (updated in place and returned)
            R - dictionary containing rewards for each state
            states - array containing tuples of states in the gridworld
            terminal_states - array containing terminal states of the gridworld
                              (unused here: a state with an empty action list
                              in P keeps the value 0)
            gamma - discount factor (default 1, undiscounted)
            theta - stop sweeping once the largest value change in a sweep is
                    at most this threshold
            max_iterations - upper bound on the number of sweeps
    Returns:
            P - optimal policy: for each state, the actions that tie for the best value
    '''
    # Work on a float copy so the caller's array is never touched
    V = np.array(V, dtype=float)

    # Loop for convergence of value function
    for _ in range(max_iterations):
        new_V = V.copy()
        delta = 0.0
        for state in states:
            # Bellman optimality backup. Transitions are deterministic, so the
            # successor value is weighted by 1 (not by a policy probability).
            backups = [R[state] + gamma * V[get_state(action, state)]
                       for action in P[state]]
            best = max(backups) if backups else 0
            delta = max(delta, abs(best - V[state]))
            new_V[state] = best
        # Synchronous sweep: every backup above read the previous sweep's values
        V = new_V
        if delta <= theta:
            break

    # Find optimal policy using precomputed values of all states
    for state in states:
        action_values = get_array(V, R, P, state)
        if len(action_values):
            best_indices = get_indices(action_values, max(action_values))
            allowed = P[state]
            P[state] = [allowed[index] for index in best_indices]

    return P

In [14]:
# Build the gridworld, then solve it.
# get_state looks up the module-level name `states`, so the unpacked name must stay `states`.
V, R, P, states, terminal_states = initialization()
P = value_iteration(V, P, R, states, terminal_states)

# Show the optimal policy as a grid:
# each cell lists the optimal action(s) to take in that state
print("Optimal policy is: ")
for row in range(4):
    line = ''
    for col in range(4):
        # Every cell is followed by one space, including the last one in the row
        line += str(P[(row, col)]) + ' '
    print(line)

Optimal policy is: 
[] ['L'] ['L'] ['L', 'D'] 
['U'] ['L', 'U'] ['L', 'R', 'D', 'U'] ['D'] 
['U'] ['L', 'R', 'D', 'U'] ['R', 'D'] ['D'] 
['R', 'U'] ['R'] ['R'] [] 
