In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0,'../../modules')

In [2]:
import numpy as np
from maze_problem import Maze

# Value Function Policies
Given a value function $U$, we construct the greedy policy that, in every state, picks the action maximizing the one-step look-ahead. <br>
$$\pi(s) = \text{argmax}_a \bigg(R(s,a) + \lambda \sum_{s'} T(s'|s,a)\, U(s')\bigg) $$
**Taking the example from 1**

In [3]:
# Maze layout: one string per grid row, one character per cell; the outer ring is
# entirely 'W'. Presumably W = wall, B = blank, S = start, F = hazard, G = goal
# — TODO confirm against maze_problem.Maze.
world = np.array([list(row) for row in ('WWWWW',
                                        'WBBGW',
                                        'WBFBW',
                                        'WSFBW',
                                        'WBBBW',
                                        'WWWWW')])

maze = Maze(world)

Getting the value function of a deliberately bad policy:

In [4]:
# Hand-written poor policy: one action letter (L/R/U/D) per non-'W' cell, written
# here in the same 4x3 arrangement as the policy map printed below the cell.
bad_policy = list('RDL'
                  'RDL'
                  'RUL'
                  'UUL')
bad_policy_transition_matrix = maze.get_policy_matrix(bad_policy)
# NOTE(review): the second argument (20) is presumably the number of animation steps — confirm.
maze.make_animation(bad_policy_transition_matrix, 20)

[[                  ] 
 [    R   D   L     ] 
 [    R   D   L     ] 
 [    R   U   L     ] 
 [    U   U   L     ] 
 [                  ]] policy map


In [5]:
def calculate_value_function(reward,lambda_discount,transition_matrix):
    """Evaluate a fixed policy exactly by solving U = R + lambda * T^T U for U.

    Parameters
    ----------
    reward : array_like, shape (n,)
        Reward associated with each state.
    lambda_discount : float
        Discount factor applied to the value of the next state.
    transition_matrix : array_like, shape (n, n)
        Transition matrix of the policy, indexed with the current state in
        columns and the next state in rows.

    Returns
    -------
    numpy.ndarray
        Value of each state under the policy, in the same order as ``reward``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``I - lambda_discount * transition_matrix.T`` is singular.
    """
    # Accept plain lists as well as arrays.
    reward = np.asarray(reward)
    transition_matrix = np.asarray(transition_matrix)
    # transpose included here because current state is indexed in columns, next state in rows.
    bellman_system = np.eye(len(reward)) - lambda_discount*transition_matrix.T
    # Solve the linear system directly instead of forming the explicit inverse:
    # same result, but cheaper and numerically more accurate.
    return np.linalg.solve(bellman_system, reward)
# Per-state reward: every state costs -1, except state 2 (+10) and states 4 and 7 (-50).
# Presumably states are numbered row by row over the non-'W' cells, which would put
# +10 on the 'G' cell and -50 on the two 'F' cells — TODO confirm against Maze.
reward = np.full(12, -1)
reward[2] = 10
reward[[4, 7]] = -50
discount = 0.95
# Exact value of every state when following the bad policy.
value_func = calculate_value_function(reward, discount, bad_policy_transition_matrix)
maze.show_on_map(np.round(value_func, 1))

[[                              ] 
 [       -593.5 -628.0 -592.7   ] 
 [       -633.9 -688.6 -633.7   ] 
 [       -634.5 -688.7 -633.9   ] 
 [       -597.7 -628.8 -594.2   ] 
 [                              ]]


Compute an improved policy by taking, for each state, the action with the maximum look-ahead value:

In [6]:
# One-step look-ahead value of each action in every state:
#     Q(s, a) = R(s) + discount * sum_{s'} T_a(s'|s) * U(s')
# value_func.dot(T_a) sums over the row index of T_a and leaves one entry per column;
# presumably the per-action matrices use the same layout as the policy matrices
# (next state in rows, current state in columns) — confirm against Maze.
# The same reward is added for every action, so the discount does not change which
# action wins the argmax, but it is required for these to be the look-ahead values.
left_value = reward + discount*value_func.dot(maze.left_transition_matrix)
right_value = reward + discount*value_func.dot(maze.right_transition_matrix)
up_value = reward + discount*value_func.dot(maze.up_transition_matrix)
down_value = reward + discount*value_func.dot(maze.down_transition_matrix)
# One row per state, one column per action, in the order L, R, U, D.
all_decisions = np.concatenate([left_value.reshape(-1,1),
                                right_value.reshape(-1,1),
                                up_value.reshape(-1,1),
                                down_value.reshape(-1,1)],axis=1)
# Greedy choice: column index of the largest look-ahead value in each row
# (np.argmax returns the first maximum, so ties resolve in L, R, U, D order).
best_policy_index = np.argmax(all_decisions,axis=1)
best_policy = [['L','R','U','D'][i] for i in best_policy_index]
maze.show_on_map(best_policy)

[[                  ] 
 [    U   R   L     ] 
 [    U   U   U     ] 
 [    D   D   D     ] 
 [    D   R   D     ] 
 [                  ]]


Showing the better policy:

In [7]:
# Build the transition matrix induced by the greedy policy computed above and
# animate it with the same settings as the bad policy, so the two can be compared.
better_policy_transition_matrix = maze.get_policy_matrix(best_policy)
# NOTE(review): the second argument (20) is presumably the number of animation steps — confirm.
maze.make_animation(better_policy_transition_matrix,20)

[[                  ] 
 [    U   R   L     ] 
 [    U   U   U     ] 
 [    D   D   D     ] 
 [    D   R   D     ] 
 [                  ]] policy map
