In [14]:
import numpy as np
import sys
# if "../" not in sys.path:
#   sys.path.append("../") 
from lib.envs.gridworld import GridworldEnv, Action

"""
UP = 0
RIGHT = 1
DOWN = 2
LEFT = 3
UPRIGHT = 4
DOWNRIGHT = 5
DOWNLEFT = 6
UPLEFT = 7
"""
# Module-level aliases for the eight action codes, read from the `.value`
# of the corresponding Action members (the string above lists the values
# this cell printed when it was last run).
(
    UP,
    RIGHT,
    DOWN,
    LEFT,
    UPRIGHT,
    DOWNRIGHT,
    DOWNLEFT,
    UPLEFT,
) = (
    Action.UP.value,
    Action.RIGHT.value,
    Action.DOWN.value,
    Action.LEFT.value,
    Action.UPRIGHT.value,
    Action.DOWNRIGHT.value,
    Action.DOWNLEFT.value,
    Action.UPLEFT.value,
)
# Echo the codes so the mapping is visible in the cell output.
print(UP, RIGHT, DOWN, LEFT, UPRIGHT, DOWNRIGHT, DOWNLEFT, UPLEFT)

0 1 2 3 4 5 6 7


In [47]:
# Build the environment object that the evaluation / iteration cells below
# read through env.nS, env.nA, env.P and env.shape.
# NOTE(review): [6,6] is presumably the grid shape (rows, cols) — confirm
# against GridworldEnv's signature.
env = GridworldEnv([6,6], actionmode=8) # actionmode = 4 or 8

## Value Evaluation Given Policy (Prediction Problem)

预测问题和控制问题如何联系起来？

In [51]:
def value_evaluation(env, policy, discount_factor = 1.0, theta = 1e-4, max_iterations = None):
    """
    Iterative policy evaluation: estimate the state values of a fixed policy.

    Every sweep visits the states in index order and overwrites V[s] in place
    with the policy-weighted expected one-step return, so states visited later
    in a sweep already see the values refreshed earlier in that sweep.

    Args:
        env: Environment object exposing
            env.nS: number of states,
            env.nA: number of actions,
            env.P[s][a]: list of transition tuples (prob, next_state, reward, done).
        policy: 2D array-like indexed as policy[s][a], holding the probability
            of taking action a in state s.
        discount_factor: Gamma discount factor.
        theta: Evaluation stops once the largest value change over all states
            within one sweep is less than theta.
        max_iterations: Optional upper bound on the number of sweeps. With the
            default None the loop only ends when the theta criterion is met,
            which never happens if the values diverge (for example
            discount_factor = 1.0 with a policy that keeps collecting non-zero
            reward forever). At least one sweep is always performed.

    Returns:
        A tuple (V, count): V is a vector of length env.nS with the estimated
        value of each state under `policy`, count is the number of sweeps run.
        When the sweep cap is hit, count equals max_iterations and V is the
        last (not converged) estimate.
    """

    def q_values(state, V):
        """
        Expected one-step return of every action in `state`, bootstrapping from V.

        Args:
            state: The state to consider (int)
            V: Current value estimates, vector of length env.nS

        Returns:
            A vector of length env.nA containing the expected value of each action.
        """
        Q = np.zeros(env.nA)
        for a in range(env.nA):
            # The done flag is unpacked but not used: the bootstrap term always
            # reads V[next_state], whatever the flag says.
            for prob, next_state, reward, _done in env.P[state][a]:
                Q[a] += prob * (reward + discount_factor * V[next_state])
        return Q

    V = np.zeros(env.nS)
    count = 0
    while True:
        count += 1
        # Largest absolute value change seen during this sweep
        delta = 0
        for s in range(env.nS):
            Q = q_values(s, V)
            # Expectation of the action values under the policy's distribution
            new_value = 0
            for action, action_prob in enumerate(policy[s]):
                new_value += action_prob * Q[action]
            delta = max(delta, np.abs(new_value - V[s]))
            V[s] = new_value
        # Converged: no state moved by theta or more in this sweep
        if delta < theta:
            break
        # Safety net against divergence / very slow convergence
        if max_iterations is not None and count >= max_iterations:
            break
    return V, count

In [52]:
# Evaluate the equiprobable policy: in every state each of the env.nA actions
# is taken with probability 1 / env.nA. Discount factor is 0.8.
uniform_random_policy = np.full((env.nS, env.nA), 1.0 / env.nA)
V_eval, count = value_evaluation(env, uniform_random_policy, discount_factor=0.8)

# Number of sweeps, then the values as a flat vector (3 decimals)
print('count =', count)
print("Value Function:")
print(V_eval.round(3))

# Same values laid out on the grid given by env.shape
print("Reshaped Grid Value Function:")
print(V_eval.reshape(env.shape).round(3))


count = 29
Value Function:
[ 0.    -3.896 -4.619 -4.846 -4.92  -4.947 -3.896 -4.108 -4.652 -4.833
 -4.896 -4.92  -4.619 -4.652 -4.746 -4.817 -4.833 -4.846 -4.846 -4.833
 -4.817 -4.746 -4.652 -4.619 -4.92  -4.896 -4.833 -4.652 -4.108 -3.897
 -4.947 -4.92  -4.846 -4.619 -3.897  0.   ]
Reshaped Grid Value Function:
[[ 0.    -3.896 -4.619 -4.846 -4.92  -4.947]
 [-3.896 -4.108 -4.652 -4.833 -4.896 -4.92 ]
 [-4.619 -4.652 -4.746 -4.817 -4.833 -4.846]
 [-4.846 -4.833 -4.817 -4.746 -4.652 -4.619]
 [-4.92  -4.896 -4.833 -4.652 -4.108 -3.897]
 [-4.947 -4.92  -4.846 -4.619 -3.897  0.   ]]


## Value Function Iteration To Find Optimal Policy (Control Problem)

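目前的版本感觉有些问题，在同一辅对角线上的策略应该是存在多个选择的：这些状态到两个终点的距离相等，因此有多个动作的价值并列最大。而 `value_iteration` 用 `np.argmax` 构造确定性策略，`np.argmax` 在并列最大值时只返回第一个下标，所以每个状态只保留了编号最小的那个最优动作。这个后续再研究一下。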

In [53]:
def value_iteration(env, discount_factor = 1.0, theta = 1e-4, max_iterations = None):
    """
    Value Iteration Algorithm.

    Repeatedly sweeps over the states in index order, replacing V[s] in place
    with the best one-step lookahead value, then reads a greedy policy off the
    final value function.

    Args:
        env: Environment object exposing
            env.nS: number of states,
            env.nA: number of actions,
            env.P[s][a]: list of transition tuples (prob, next_state, reward, done).
        discount_factor: Gamma discount factor.
        theta: Iteration stops once the largest value change over all states
            within one sweep is less than theta.
        max_iterations: Optional upper bound on the number of sweeps. With the
            default None the loop only ends when the theta criterion is met,
            which never happens if the values diverge (for example
            discount_factor = 1.0 in an environment where non-zero reward can
            be collected forever). At least one sweep is always performed.

    Returns:
        A tuple (policy, V, count):
            policy: array of shape [env.nS, env.nA]; each row is one-hot on the
                greedy action. Ties are broken by np.argmax, i.e. the
                lowest-index action among equally good ones is chosen, so other
                equally optimal actions are not shown.
            V: vector of length env.nS with the value estimate of each state.
            count: number of sweeps run (equals max_iterations when the cap
                was hit before convergence).
    """

    def one_step_lookahead(state, V):
        """
        Expected one-step return of every action in `state`, bootstrapping from V.

        Args:
            state: The state to consider (int)
            V: Current value estimates, vector of length env.nS

        Returns:
            A vector of length env.nA containing the expected value of each action.
        """
        A = np.zeros(env.nA)
        for a in range(env.nA):
            # The done flag is unpacked but not used: the bootstrap term always
            # reads V[next_state], whatever the flag says.
            for prob, next_state, reward, _done in env.P[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A

    V = np.zeros(env.nS)
    count = 0
    while True:
        count += 1
        # Largest absolute value change seen during this sweep
        delta = 0
        for s in range(env.nS):
            # Bellman optimality backup. Ref: Sutton book eq. 4.10.
            best_action_value = np.max(one_step_lookahead(s, V))
            delta = max(delta, np.abs(best_action_value - V[s]))
            V[s] = best_action_value
        # Converged: no state moved by theta or more in this sweep
        if delta < theta:
            break
        # Safety net against divergence / very slow convergence
        if max_iterations is not None and count >= max_iterations:
            break

    # Create a deterministic policy that is greedy w.r.t. the final values
    policy = np.zeros([env.nS, env.nA])
    for s in range(env.nS):
        # np.argmax returns the first maximiser, so ties go to the lowest index
        best_action = np.argmax(one_step_lookahead(s, V))
        policy[s, best_action] = 1.0

    return policy, V, count

In [54]:
# Run value iteration without discounting (discount factor 1.0) and keep the
# greedy policy, the value function and the number of sweeps.
policy_opt, v_opt, count = value_iteration(env, discount_factor=1.0)
print("Action Set Size:", env.nA, ', count =', count)
print("Optimal Policy Probability Distribution (row: state, column: action probability):")
print(policy_opt)

# Legend for the action codes depends on the size of the action set
print(
    "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):"
    if env.nA == 4
    else "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left, 4=upright, 5=downright, 6=downleft, 7=upleft):"
)
# Collapse each policy row to the index of its largest entry and lay it out on the grid
print(policy_opt.argmax(axis=1).reshape(env.shape))

print("Value Function:")
print(v_opt)

print("Reshaped Grid Value Function:")
print(np.reshape(v_opt, env.shape))

Action Set Size: 8 , count = 6
Optimal Policy Probability Distribution (row: state, column: action probability):
[[1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 