In [1]:
import numpy as np
import sys
# if "../" not in sys.path:
#   sys.path.append("../") 
from lib.envs.gridworld import GridworldEnv, UP, RIGHT, DOWN, LEFT
"""
UP = 0
RIGHT = 1
DOWN = 2
LEFT = 3
"""

'\nUP = 0\nRIGHT = 1\nDOWN = 2\nLEFT = 3\n'

In [2]:
env = GridworldEnv([6,6])

## Value Evaluation Given Policy (Prediction Problem)

预测问题和控制问题如何联系起来？

In [7]:
def value_evaluation(env, policy, discount_factor = 1.0, theta = 1e-4):
    """
    Value Iteration Algorithm.
    
    Args:
        env: OpenAI env. env.P represents the transition probabilities of the environment.
            env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).
            env.nS is a number of states in the environment. 
            env.nA is a number of actions in the environment.
        policy: a dict object, e.g. {UP:0.25, DOWN: 0.25, LEFT: 0.25, RIGHT: 0.25}.
        theta: We stop evaluation once our value function change is less than theta for all states.
        discount_factor: Gamma discount factor.
        
    Returns:
        A tuple (policy, V) of the optimal policy and the optimal value function.
    """
    
    def qValue(state, V):
        """
        Helper function to calculate the value for all action in a given state.
        
        Args:
            state: The state to consider (int)
            V: The value to use as an estimator, Vector of length env.nS
        
        Returns:
            A vector of length env.nA containing the expected value of each action.
        """
        Q = np.zeros(env.nA)
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                Q[a] += prob * (reward + discount_factor * V[next_state])
        return Q
    
    V = np.zeros(env.nS)
    while True:
        # Stopping condition
        delta = 0
        # Update each state...
        for s in range(env.nS):
            # Do a one-step lookahead to find the best action
            Q = qValue(s, V)
            # Update the value function.
            new_value = 0
            for action in range(env.nA):
                new_value += policy[action] * Q[action]
            delta = max(delta, np.abs(new_value - V[s]))
            V[s] = new_value
        # Check if we can stop 
        if delta < theta:
            break  
    return V

In [31]:
# value evaluation given a uniform random policy
uniform_random_policy = {UP:0.25, DOWN: 0.25, LEFT: 0.25, RIGHT: 0.25}
V_eval = value_evaluation(env, uniform_random_policy, 0.8)

print("Value Function:")
print(V_eval)

print("Reshaped Grid Value Function:")
print(V_eval.reshape(env.shape))


Value Function:
[ 0.         -3.43127516 -4.48088509 -4.81270103 -4.9161484  -4.94404788
 -3.43127516 -4.24439464 -4.67976642 -4.85397723 -4.90805018 -4.91619958
 -4.48088509 -4.67976642 -4.81967489 -4.86946077 -4.85401309 -4.81278571
 -4.81270103 -4.85397723 -4.86946077 -4.81970547 -4.67982757 -4.48098567
 -4.9161484  -4.90805018 -4.85401309 -4.67982757 -4.24447134 -3.43136156
 -4.94404788 -4.91619958 -4.81278571 -4.48098567 -3.43136156  0.        ]
Reshaped Grid Value Function:
[[ 0.         -3.43127516 -4.48088509 -4.81270103 -4.9161484  -4.94404788]
 [-3.43127516 -4.24439464 -4.67976642 -4.85397723 -4.90805018 -4.91619958]
 [-4.48088509 -4.67976642 -4.81967489 -4.86946077 -4.85401309 -4.81278571]
 [-4.81270103 -4.85397723 -4.86946077 -4.81970547 -4.67982757 -4.48098567]
 [-4.9161484  -4.90805018 -4.85401309 -4.67982757 -4.24447134 -3.43136156]
 [-4.94404788 -4.91619958 -4.81278571 -4.48098567 -3.43136156  0.        ]]


## Value Function Iteration To Find Optimal Policy (Control Problem)

目前的版本感觉有些问题，在同一辅对角线上的策略应该是存在多个选择的。这个后续再研究一下。

In [13]:
def value_iteration(env, discount_factor = 1.0, theta = 1e-4):
    """
    Value Iteration Algorithm.
    
    Args:
        env: OpenAI env. env.P represents the transition probabilities of the environment.
            env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).
            env.nS is a number of states in the environment. 
            env.nA is a number of actions in the environment.
        theta: We stop evaluation once our value function change is less than theta for all states.
        discount_factor: Gamma discount factor.
        
    Returns:
        A tuple (policy, V) of the optimal policy and the optimal value function.
    """
    
    def one_step_lookahead(state, V):
        """
        Helper function to calculate the value for all action in a given state.
        
        Args:
            state: The state to consider (int)
            V: The value to use as an estimator, Vector of length env.nS
        
        Returns:
            A vector of length env.nA containing the expected value of each action.
        """
        A = np.zeros(env.nA)
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A
    
    V = np.zeros(env.nS)
    while True:
        # Stopping condition
        delta = 0
        # Update each state...
        for s in range(env.nS):
            # Do a one-step lookahead to find the best action
            A = one_step_lookahead(s, V)
            best_action_value = np.max(A)
            # Calculate delta across all states seen so far
            delta = max(delta, np.abs(best_action_value - V[s]))
            # Update the value function. Ref: Sutton book eq. 4.10. 
            V[s] = best_action_value        
        # Check if we can stop 
        if delta < theta:
            break
    
    # Create a deterministic policy using the optimal value function
    policy = np.zeros([env.nS, env.nA])
    for s in range(env.nS):
        # One step lookahead to find the best action for this state
        A = one_step_lookahead(s, V)
        best_action = np.argmax(A)
        # Always take the best action
        policy[s, best_action] = 1.0
    
    return policy, V

In [25]:
policy_opt, v_opt = value_iteration(env, 1.0)
print("Optimal Policy Probability Distribution (row: state, column: action probability):")
print(policy_opt)

print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):")
print(np.reshape(np.argmax(policy_opt, axis=1), env.shape))

print("Value Function:")
print(v_opt)

print("Reshaped Grid Value Function:")
print(v_opt.reshape(env.shape))

Optimal Policy Probability Distribution (row: state, column: action probability):
[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]
Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):
[[0 3 3 3 3 2]
 [0 0 0 0 0 2]
 [0 0 0 0 1 2]
 [0 0 0 1 1 2]
 [0 0 1 1 1 2]
 [0 1 1 1 1 0]]
Value Function:
[ 0. -1. -2. -3. -4. -5. -1. -2. -3. -4. -5. -4. -2. -3. -4. -5. -4. -3.
 -3. -4. -5. -4. -3. -2. -4. -5. -4. -3. -2. -1. -5. -4. -3. -2. -1.  0.]
Reshaped Grid Value Function:
[[ 0. -1. -2. -3. -4. -5.]
 [-1. -2. -3

In [32]:
env.render()

NotImplementedError: 