In [1]:
import numpy as np

# GridWorld
grid_size = [4, 4]

def get_state(state, action):
    action_grid = [(0, -1), (-1, 0), (0, 1), (1, 0)]
    
    state[0] += action_grid[action][0]
    state[1] += action_grid[action][1]
    
    if state[0] < 0 :
        state[0] = 0
        
    elif state[0] > 3 :
        state[0] = 3
    
    if state[1] < 0 :
        state[1] = 0
    
    elif state[1] > 3 :
        state[1] = 3
    
    return state[0], state[1]

In [2]:
# Envrionment
env = {}
env['action'] = [0, 1, 2, 3] # left, up, right, down
env['gamma'] = 1
env['reward'] = -1

In [3]:
# Agent
policy = np.empty([grid_size[0], grid_size[1], len(env['action'])], dtype=float)
for i in range(grid_size[0]):
    for j in range(grid_size[1]):
        for k in range(len(env['action'])):
            if i==j and i==3:
                policy[i][j]=0.00
            else :
                policy[i][j]=0.25
                
policy[3][3] = [0] * grid_size[1] # Goal

In [4]:
def evaluation(env, grid_size, policy):
    # initialize
    data = np.zeros([grid_size[0], grid_size[1]], dtype=float)
    while True:
        data_b = np.zeros([grid_size[0], grid_size[1]], dtype=float)
        for i in range(grid_size[0]):
            for j in range(grid_size[1]):
                if i==j and i==3:
                    value_t = 0
                else :
                    value_t = 0
                    for act in env['action']:
                        i_, j_ = get_state([i,j], act)
                        value = policy[i][j][act] * (env['reward'] + env['gamma'] * data[i_][j_])
                        value_t += value
                data_b[i][j] = round(value_t, 4)
                
        if (data - data_b).mean() < 1e-6:
            break
        else:
            data = data_b.copy()
        
    return data_b

In [5]:
def improvement(value, env, grid_size, policy):
    for i in range(grid_size[0]):
        for j in range(grid_size[1]):
            v_list=[]
            for k in range(len(env['action'])):
                i_, j_ = get_state([i, j], k)
                v_list.append(value[i_][j_])
                
            max_actions = [action_v for action_v, x in enumerate(v_list) if x == max(v_list)] 
            
            policy[i][j]= [0]*len(env['action']) # initialize
            for y in max_actions :
                policy[i][j][y] = (1 / len(max_actions))
    
    return policy

In [6]:
for _ in range(10):
    value = evaluation(env, grid_size, policy)
    updated_policy = improvement(value, env, grid_size, policy)

In [7]:
print(value)
print("")
print(updated_policy)

[[-6. -5. -4. -3.]
 [-5. -4. -3. -2.]
 [-4. -3. -2. -1.]
 [-3. -2. -1.  0.]]

[[[0.  0.  0.5 0.5]
  [0.  0.  0.5 0.5]
  [0.  0.  0.5 0.5]
  [0.  0.  0.  1. ]]

 [[0.  0.  0.5 0.5]
  [0.  0.  0.5 0.5]
  [0.  0.  0.5 0.5]
  [0.  0.  0.  1. ]]

 [[0.  0.  0.5 0.5]
  [0.  0.  0.5 0.5]
  [0.  0.  0.5 0.5]
  [0.  0.  0.  1. ]]

 [[0.  0.  1.  0. ]
  [0.  0.  1.  0. ]
  [0.  0.  1.  0. ]
  [0.  0.  0.5 0.5]]]
