In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0,'../../modules')

In [2]:
import numpy as np
from maze_problem import Maze

# Value Function Policies
Given the value function $U$ we want to construct the optimal policy to maximize the look-ahead. <br>
$$\pi(s) = \text{argmax}_a \bigg(R(s,a) + \lambda \sum_{s'} T(s'|s,a) U_k^\pi(s')\bigg) $$
So, simply checking all actions and choosing the maximum expected utility is the right thing to do. <br>
**Taking the example from 1**

In [3]:
# Maze layout, one string per grid row and one character per cell.
# 'W' forms the outer border; the 12 interior cells are the states that the
# policies and the reward vector in this notebook are indexed by, row by row.
# NOTE(review): presumably B=blank, S=start, F=fire, G=goal -- confirm in maze_problem.
world_rows = ['WWWWW',
              'WBBGW',
              'WBFBW',
              'WSFBW',
              'WBBBW',
              'WWWWW']
world = np.array([list(row) for row in world_rows])

maze = Maze(world)

Getting the value function from a bad policy:

In [4]:
# Deliberately poor policy: one action letter per interior cell,
# grouped here by maze row (three cells per row).
bad_policy = list('RDL' 'RDL' 'RUL' 'UUL')
bad_policy_transition_matrix = maze.get_policy_matrix(bad_policy)
maze.make_animation(bad_policy_transition_matrix,20)

In [5]:
def calculate_value_function(reward,lambda_discount,transition_matrix):
    """Evaluate a fixed policy exactly by solving the linear Bellman system.

    Solves ``(I - lambda_discount * T.T) U = reward`` for ``U``.

    Parameters
    ----------
    reward : array_like, length n
        Reward received in each state.
    lambda_discount : float
        Discount factor applied to the utility of the next state.
    transition_matrix : ndarray, shape (n, n)
        Transition matrix of the policy, indexed with the *current* state in
        columns and the *next* state in rows (hence the transpose below).

    Returns
    -------
    ndarray
        Utility of every state under the policy.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the system matrix is singular.
    """
    # transpose included here because current state is indexed in columns, next state in rows.
    system_matrix = np.eye(len(reward)) - lambda_discount * transition_matrix.T
    # Solve the linear system directly instead of forming the explicit inverse:
    # same solution, but cheaper and numerically more accurate.
    return np.linalg.solve(system_matrix, reward)
# Per-state reward, written three per line so it lines up with the maze rows.
reward = np.array([ -1,  -1,  99,
                    -1, -30,  -1,
                    -1, -30,  -1,
                    -1,  -1,  -1])
discount = 0.95
# Utility of every state when following the bad policy forever.
value_func = calculate_value_function(reward=reward,
                                      lambda_discount=discount,
                                      transition_matrix=bad_policy_transition_matrix)
maze.show_on_map(np.round(value_func, 1))

[[                              ] 
 [       -332.4 -348.7 -250.7   ] 
 [       -365.5 -398.0 -353.5   ] 
 [       -368.1 -399.3 -366.1   ] 
 [       -347.3 -365.2 -345.2   ] 
 [                              ]]


Calculate the best policy by taking the maximum:

In [6]:
# Greedy one-step look-ahead: for every action compute
#     R(s) + discount * sum_s' T(s'|s,a) U(s')
# and pick the action with the largest value in each state.
# The discount factor is applied so the code matches the formula above; since
# the reward does not depend on the action and the discount is positive, it
# should not change which action wins.
action_labels = ['L','R','U','D']
action_matrices = [maze.left_transition_matrix,
                   maze.right_transition_matrix,
                   maze.up_transition_matrix,
                   maze.down_transition_matrix]
# value_func.dot(T) sums over next states (rows), giving one value per current state (columns).
left_value, right_value, up_value, down_value = [
    reward + discount * value_func.dot(action_matrix)
    for action_matrix in action_matrices
]
# One row per state, one column per action, in the same order as action_labels.
all_decisions = np.concatenate([left_value.reshape(-1,1),
                                right_value.reshape(-1,1),
                                up_value.reshape(-1,1),
                                down_value.reshape(-1,1)],axis=1)
# np.argmax returns the first maximum, so ties resolve in L, R, U, D order.
best_policy_index = np.argmax(all_decisions,axis=1)
best_policy = [action_labels[i] for i in best_policy_index]
maze.show_on_map(best_policy)

[[                  ] 
 [    U   R   L     ] 
 [    U   U   U     ] 
 [    D   D   D     ] 
 [    D   R   D     ] 
 [                  ]]


Showing the better policy:

In [7]:
# Turn the improved policy into its transition matrix and animate it with the
# same second argument (20) that was used for the bad-policy animation.
# NOTE(review): the meaning of 20 (steps? frames?) is defined in maze_problem -- confirm.
better_policy_transition_matrix = maze.get_policy_matrix(best_policy)
maze.make_animation(better_policy_transition_matrix,20)

This will also work for finite-horizon decisions. <br>
Here I use the same (good) policy as in notebook 1, but change the action in the bottom-middle cell to go up into the fire instead of down in the second-to-last step.

In [8]:
# Time-dependent policies for a 4-step horizon, laid out three entries per
# line so they line up with the maze rows. "na" marks states that get no
# action at that timestep.
timestep1_policy = ["na", "na", "na",
                    "na", "na", "na",
                    "U",  "na", "na",
                    "na", "na", "na"]
timestep2_policy = ["na", "na", "na",
                    "U",  "na", "na",
                    "L",  "R",  "na",
                    "na", "na", "na"]
timestep3_policy = ["R",  "na", "na",
                    "L",  "U",  "na",
                    "L",  "D",  "U",
                    "L",  "U",  "na"]
timestep4_policy = ["U",  "R",  "na",
                    "L",  "U",  "U",
                    "L",  "D",  "R",
                    "L",  "D",  "R"]

# One policy transition matrix per timestep.
T1, T2, T3, T4 = [maze.get_policy_matrix(step_policy)
                  for step_policy in (timestep1_policy,
                                      timestep2_policy,
                                      timestep3_policy,
                                      timestep4_policy)]

# Backward recursion: the utility at step k is the immediate reward plus the
# expected utility at step k+1 (after the last step only the reward remains).
utility_4 = reward + T4.T.dot(reward)
utility_3 = reward + T3.T.dot(utility_4)
utility_2 = reward + T2.T.dot(utility_3)
utility_1 = reward + T1.T.dot(utility_2)

print("policy 3")
maze.show_on_map(timestep3_policy)

policy 3
[[                  ] 
 [    R   na  na    ] 
 [    L   U   na    ] 
 [    L   D   U     ] 
 [    L   U   na    ] 
 [                  ]]


In [9]:
def get_finite_best_policy(maze,utility):
    """Pick, for every state, the action with the highest expected next-step utility.

    Parameters
    ----------
    maze : object
        Must expose ``left_transition_matrix``, ``right_transition_matrix``,
        ``up_transition_matrix`` and ``down_transition_matrix``.
    utility : ndarray
        Utility of each state at the following timestep.

    Returns
    -------
    list of str
        One of 'L', 'R', 'U', 'D' per state; ties go to the earliest action
        in that order.
    """
    action_labels = ['L','R','U','D']
    per_action_matrices = (maze.left_transition_matrix,
                           maze.right_transition_matrix,
                           maze.up_transition_matrix,
                           maze.down_transition_matrix)
    # One column of expected returns per action, in the same order as action_labels.
    return_columns = [matrix.T.dot(utility).reshape(-1,1) for matrix in per_action_matrices]
    expected_returns = np.concatenate(return_columns,axis=1)
    winning_action = np.argmax(expected_returns,axis=1)
    return [action_labels[choice] for choice in winning_action]

def keep_na_entries(reference_policy, candidate_policy):
    """Return ``candidate_policy`` with "na" wherever ``reference_policy`` has "na".

    The two policies are paired entry by entry, so no state count is hard-coded.
    """
    return ["na" if reference_action == "na" else candidate_action
            for reference_action, candidate_action in zip(reference_policy, candidate_policy)]


# Improve the policy of each timestep against the utility of the following
# timestep (the last timestep only has the immediate reward to look at), then
# blank out the states that had no action in the original timestep policy.
new_policy_1 = keep_na_entries(timestep1_policy, get_finite_best_policy(maze,utility_2))
maze.show_on_map(new_policy_1)
new_policy_2 = keep_na_entries(timestep2_policy, get_finite_best_policy(maze,utility_3))
maze.show_on_map(new_policy_2)
new_policy_3 = keep_na_entries(timestep3_policy, get_finite_best_policy(maze,utility_4))
maze.show_on_map(new_policy_3)
new_policy_4 = keep_na_entries(timestep4_policy, get_finite_best_policy(maze,reward))
maze.show_on_map(new_policy_4)

[[                  ] 
 [    na  na  na    ] 
 [    na  na  na    ] 
 [    U   na  na    ] 
 [    na  na  na    ] 
 [                  ]]
[[                  ] 
 [    na  na  na    ] 
 [    U   na  na    ] 
 [    L   R   na    ] 
 [    na  na  na    ] 
 [                  ]]
[[                  ] 
 [    R   na  na    ] 
 [    L   U   na    ] 
 [    L   D   U     ] 
 [    L   D   na    ] 
 [                  ]]
[[                  ] 
 [    L   R   na    ] 
 [    L   U   U     ] 
 [    L   D   R     ] 
 [    L   D   L     ] 
 [                  ]]


Unsurprisingly, the policy is changed so that the bottom-middle action now points down. Some other values changed, but none that were important. E.g. in the top left it doesn't matter whether you go left or right at that point, as both expected values are -1.