In [86]:
import numpy as np
import pprint
import sys
if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.gridworld import GridworldEnv

In [87]:
# Gridworld environment instance that the cells further down pass to policy_improvement.
env = GridworldEnv()
# Pretty-printer configured with a two-space indent.
pp = pprint.PrettyPrinter(indent=2)

![Gridworld-from-Book](../images/DP-PE-Gridworld.png)

In [88]:
# Taken from Policy Evaluation Exercise!
# Check 'Policy Evaluation Solution with Explanation.ipynb' for a more detailed explanation

def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):
    """
    Iteratively compute the state-value function of a policy from the
    environment's known dynamics.

    Args:
        policy: [S, A] shaped matrix representing the policy; policy[s][a] is
            used as the probability of taking action a in state s.
        env: OpenAI env. env.nS is the number of states and env.P[s][a] is
            iterated as a sequence of (prob, next_state, reward, done) tuples.
        discount_factor: Factor multiplying the value of the next state.
        theta: Sweeping stops once the largest per-state change observed in a
            full sweep is below theta.

    Returns:
        Vector of length env.nS representing the value function.
    """
    # Initial estimate: every state is worth 0
    values = np.zeros(env.nS)
    converged = False
    while not converged:
        largest_change = 0
        # One sweep over all states. Updates are written in place, so states
        # later in the sweep already see the refreshed values of earlier ones.
        for state in range(env.nS):
            backed_up = 0
            for action, action_prob in enumerate(policy[state]):
                # Expectation over every outcome the dynamics list for (state, action)
                for prob, next_state, reward, _ in env.P[state][action]:
                    backed_up += action_prob * prob * (reward + discount_factor * values[next_state])
            # Track the biggest change seen in this sweep
            change = np.abs(backed_up - values[state])
            if change > largest_change:
                largest_change = change
            values[state] = backed_up
        # Another sweep is needed unless every state moved by less than theta
        converged = largest_change < theta
    return np.array(values)

![Policy-Iteration-from-Book](../images/DP-Policy-Iteration.png)

In [None]:
def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0):
    """
    Policy Improvement Algorithm. Iteratively evaluates and improves a policy
    until an optimal policy is found.
    
    Args:
        env: The OpenAI environment. env.nS and env.nA give the number of
            states and actions; env.P[s][a] is iterated as a sequence of
            (prob, next_state, reward, done) tuples.
        policy_eval_fn: Policy Evaluation function that takes 3 arguments:
            policy, env, discount_factor.
        discount_factor: Gamma discount factor.
        
    Returns:
        A tuple (policy, V). 
        policy is the optimal policy, a matrix of shape [S, A] where each state s
        contains a valid probability distribution over actions.
        V is the value function that policy_eval_fn computed for the returned policy.
        
    Side effects:
        Prints the initial policy, and the full policy table every time the
        row of some state changes.
    """
    # Start with a random policy. All actions have the same probability.
    # env.nS is the number of states, in our case, 16 (4x4 grid)
    # env.nA is the number of actions, in our case, 4
    # Our policy is saved as an array of probabilities for each state/grid position
    # So we have state 0, and we have 4 actions with 0.25 prob for each one [0.25 0.25 0.25 0.25]
    # That for every state of our environment, in our case, 16 states, 4 possible actions each
    # In future exercises, we will find ways of storing this information in a more compact way
    policy = np.ones([env.nS, env.nA]) / env.nA
    # we print it to see it more clearly
    print("First Uniformly Random Policy Probability Distribution:")
    print(policy)
    print("")
    
    # np.eye(env.nA) creates an identity matrix of size env.nA x env.nA (4x4 in our gridworld)
    # [1 0 0 0
    #  0 1 0 0
    #  0 0 1 0
    #  0 0 0 1]
    # Row k is the distribution that puts 100% of the probability on action k.
    # It does not depend on the state, so we build it once instead of once per state.
    one_hot_rows = np.eye(env.nA)
    
    # Iteration counter
    iteration = 0
    
    while True:
        # Evaluate the current policy
        # For more info, go to Policy Evaluation Solution with Explanation.ipynb
        V = policy_eval_fn(policy, env, discount_factor)
        
        # Will be set to False if we make any changes to the policy
        policy_stable = True
        
        # For each state (each state is a position on the grid)
        for s in range(env.nS):
            # Find the best action by one-step lookahead
            # Array with one entry per action, in our gridworld, [0 0 0 0]
            action_values = np.zeros(env.nA)
            for a in range(env.nA):
                for prob, next_state, reward, done in env.P[s][a]:
                    # we add all possible rewards over all possible outcomes if we take action a at state s
                    # we have complete knowledge of our environment. env.P is like our oracle. In future
                    # and more realistic problems, we'll not have that
                    action_values[a] += prob * (reward + discount_factor * V[next_state])
            # and we choose the action whose score is the greatest
            # Ties are resolved by np.argmax, which returns the first maximal index
            best_a = np.argmax(action_values)
            
            # Being greedy means saying that we are 100% sure that best_a is the
            # action to take in this state: if best action is 2, the row is [0 0 1 0]
            greedy_row = one_hot_rows[best_a]
            
            # Compare the whole row, not just argmax(policy[s]). The argmax of a
            # uniform row is always 0, so a stochastic policy whose greedy action
            # happens to be 0 everywhere would look "stable", and we would return
            # the greedy policy together with the value function of the random one.
            if not np.array_equal(policy[s], greedy_row):
                # The policy at this state was not the greedy one, so our policy is
                # not the best one yet and we need to keep improving it
                policy_stable = False
                
                # Now, our policy at state s will be taking the best action
                policy[s] = greedy_row
                
                # We print the table only when there are changes on the policy
                print("Policy Probability Distribution at iteration",iteration,"state",s)
                print(policy)
                print("")

        iteration += 1
        
        # If the policy is stable we've found an optimal policy. Return it
        if policy_stable:
            return policy, V

In [84]:
# Run policy iteration on the gridworld, then report the resulting policy and
# value function, each both as a flat table and reshaped to the grid layout.
policy, v = policy_improvement(env)

# Each call emits the heading, the array and an empty line, one per line.
print("Policy Probability Distribution:", policy, "", sep="\n")

print(
    "Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):",
    np.argmax(policy, axis=1).reshape(env.shape),
    "",
    sep="\n",
)

print("Value Function:", v, "", sep="\n")

print("Reshaped Grid Value Function:", v.reshape(env.shape), "", sep="\n")



First Uniformly Random Policy Probability Distribution:
[[ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]]

Policy Probability Distribution at iteration 0 state 0
[[ 1.    0.    0.    0.  ]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]
 [ 0.25  0.25  0.25  0.25]]

Policy Probability Di

In [85]:
# Check the computed value function against the expected per-state values,
# written four per row to mirror the 4x4 grid.
expected_v = np.array([
     0, -1, -2, -3,
    -1, -2, -3, -2,
    -2, -3, -2, -1,
    -3, -2, -1,  0,
])
# Raises AssertionError if v does not match expected_v to 2 decimals.
np.testing.assert_array_almost_equal(v, expected_v, decimal=2)