# Dynamic Programming

In [1]:
import numpy as np
import gym
import matplotlib.pyplot as plt
import random

## Helper Functions

In [2]:
def print_policy(pi, nS, n_cols,
                 name='Frozen Lake Random Strategy',
                 terminal_states=[5, 7, 11, 12, 15],
                 actions_mapping={0: '\u2190', 1: '\u2193', 2: '\u2192', 3: '\u2191'}):
    '''
    Print a gridworld policy as a grid of action symbols.

    A bold title is printed first (surrounded by blank lines), followed by
    one right-aligned, 10-character-wide cell per state. A line break is
    emitted after every `n_cols` cells.

    Args:
        pi:               policy; callable mapping a state index to an action
        nS:               number of states in the gridworld
        n_cols:           number of columns in the gridworld
        name:             title printed in bold above the grid
        terminal_states:  states drawn as a filled square instead of an
                          action; `pi` is not queried for these states
        actions_mapping:  mapping from the action returned by `pi` to the
                          symbol that is printed for it

    Returns:
        None
    '''
    bold_on, bold_off = '\033[1m', '\033[0m'

    print('\n')
    print(bold_on + name + bold_off)
    print('\n')

    for cell in range(nS):
        # The last cell of a row ends the line, every other cell is
        # followed by a single space.
        separator = ' ' if (cell + 1) % n_cols else '\n'

        if cell in terminal_states:
            symbol = '\u25A0'
        else:
            symbol = actions_mapping[pi(cell)]

        print(symbol.rjust(10), end=separator)

def print_state_value_func(V, n_cols):
    '''
    Print a state value function laid out as a grid.

    Every value is formatted with five decimal places. Values on the same
    row are separated by a space and a line break follows every `n_cols`
    values.

    Args:
        V:       state value function, indexable by state number 0..len(V)-1
        n_cols:  number of columns in the grid world
    Returns:
        None
    '''
    n_states = len(V)
    for position in range(1, n_states + 1):
        # `position` is 1-based, so a multiple of n_cols closes a row.
        terminator = ' ' if position % n_cols else '\n'
        print(f'{V[position - 1]:.5f}', end=terminator)

## Policy Iteration

In [3]:
# policy evaluation
def policy_evaluation(pi, env, gamma=1.0, delta=1e-10):
    '''
    Iteratively compute the state value function of a fixed policy.

    States are swept in index order and the value array is updated in
    place, so values refreshed earlier in a sweep are already used for the
    states that follow. Sweeping stops once the largest absolute change of
    any state value over a full sweep is below `delta`.

    Args:
        pi:      policy to be evaluated (returns an action given a state)
        env:     openai gym environment; the dynamics are read from
                 env.env.P and the state count from env.observation_space.n
        gamma:   discount factor
        delta:   threshold on the per-sweep change that stops the evaluation

    Returns:
        State value function V (numpy float64 array with one entry per state)
    '''
    # dynamics[state][action] is a list of
    # (transition probability, next state, reward, final state flag) tuples
    dynamics = env.env.P
    n_states = env.observation_space.n

    # start from an all-zero value function
    values = np.zeros(n_states, dtype=np.float64)

    converged = False
    while not converged:
        before_sweep = values.copy()

        for s in range(n_states):
            chosen = pi(s)
            # expected one-step return; the bootstrap term is dropped for
            # transitions flagged as final
            values[s] = sum(
                prob * (r + gamma * values[s_next] * (not terminal))
                for prob, s_next, r, terminal in dynamics[s][chosen]
            )

        converged = np.max(np.abs(before_sweep - values)) < delta

    return values

In [4]:
# policy improvement
def policy_improvement(env, V, gamma=1.0):
    '''
    Build the policy that is greedy with respect to a value function.

    For every state/action pair the expected one-step return is computed
    from the environment dynamics (env.env.P) and `V`; the returned policy
    picks, per state, the action with the largest such return (the first
    one in case of ties, as given by np.argmax).

    Args:
        env:     openai gym environment
        V:       state value function of a given policy
        gamma:   discount factor

    Returns:
        pi:      a policy acting greedily using the value function of the current policy

    '''
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    dynamics = env.env.P
    action_values = np.zeros((n_states, n_actions))

    for s in range(n_states):
        for a in range(n_actions):
            # bootstrap from V only when the transition is not final
            action_values[s, a] = sum(
                prob * (r + gamma * V[s_next] * (not terminal))
                for prob, s_next, r, terminal in dynamics[s][a]
            )

    # state -> best action lookup table
    best_action = dict(enumerate(np.argmax(action_values, axis=1)))

    def pi(s):
        return best_action[s]

    return pi

In [28]:
# policy iteration
def policy_iteration(pi, env, gamma=1, delta=1e-10):
    '''
    Find an optimal policy by alternating evaluation and greedy improvement.

    Starting from `pi`, the current policy is evaluated with
    policy_evaluation and a greedy policy is derived from the result with
    policy_improvement. The loop stops as soon as the greedy policy selects
    the same action as the current policy in every state.

    Args:
        pi:      starting policy (returns an action given a state)
        env:     openai gym environment
        gamma:   discount factor
        delta:   threshold value to interrupt the policy evaluation

    Returns:
        Tuple (policy, V): the final greedy policy and its state value
        function
    '''
    n_states = env.observation_space.n

    # Tabulate the policy so that two policies can be compared state by state.
    current_strategy = {s: pi(s) for s in range(n_states)}

    while True:
        V = policy_evaluation(pi, env, gamma, delta)
        new_pi = policy_improvement(env, V, gamma)
        new_strategy = {s: new_pi(s) for s in range(n_states)}

        if new_strategy == current_strategy:
            # The greedy policy picks exactly the actions of the policy that
            # was just evaluated, so V already is its value function; a
            # second evaluation would only repeat the same computation.
            return new_pi, V

        current_strategy = new_strategy
        pi = new_pi

SyntaxError: invalid syntax (<ipython-input-28-96e45f172dea>, line 2)

## Value Iteration

In [84]:
def value_iteration(env, gamma=1, delta=1e-10):
    '''
    Finds an optimal policy
    
    Args: 
        env:     openai gym environment
        gamma:   discount factor
        delta:   threshhold value to interrupt the policy evaluation
    
    Returns: 
        Optimal policy and value function
    ''' 
    nS = env.observation_space.n
    nA = env.action_space.n
    V = np.zeros(nS, dtype=np.float64)
    P = env.env.P
    
    while True:
        V_old = V.copy()
        Q = np.zeros(shape=(nS, nA), dtype=np.float64)
        for state in range(nS):
            for action in range(nA):
                for pr, next_state, reward, done in P[state][action]:
                    Q[state][action] += pr * (reward + gamma * V[next_state] * (not done))
        V = np.max(Q, axis=1)
        max_diff = np.max(np.abs(V_old - V))
        if max_diff < delta:
            break

    strategy = {s: a for s, a in enumerate(np.argmax(Q, axis=1))}
    def pi(s):
        return strategy[s]
    
    return pi, V

## Random Policy

In [7]:
# Seed Python's `random` module, which create_random_policy draws from via
# random.randint, so the generated policy is reproducible when the notebook
# is run top to bottom on a fresh kernel.
random.seed(42)

# random policy
def create_random_policy(nS, nA):
    '''
    Generate a fixed, randomly chosen deterministic policy for a gridworld.

    One action in the range 0..nA-1 is drawn per state (in state order)
    with random.randint when this function is called; the returned policy
    only looks the stored action up, so it always answers the same for a
    given state.

    Args:
        nS: Number of states in a gridworld
        nA: Number of actions in a state

    Returns:
        random policy (callable mapping a state index to its drawn action)
    '''
    drawn_actions = {s: random.randint(0, nA - 1) for s in range(nS)}

    def random_policy(s):
        return drawn_actions[s]

    return random_policy

## Frozen Lake

In [8]:
env = gym.make('FrozenLake-v0')

In [9]:
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [10]:
random_policy = create_random_policy(env.observation_space.n, env.action_space.n)

In [11]:
V = policy_evaluation(random_policy, env)

In [12]:
print_policy(random_policy, 16, 4)



[1mFrozen Lake Random Strategy[0m


         ←          ←          →          ↓
         ↓          ■          ←          ■
         ↑          ←          ←          ■
         ■          ↓          ←          ■


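The random policy performs rather poorly: as the value function below shows, its value is zero in every state, i.e. it never reaches the goal.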

In [13]:
print_state_value_func(V, 4)

0.00000 0.00000 0.00000 0.00000
0.00000 0.00000 0.00000 0.00000
0.00000 0.00000 0.00000 0.00000
0.00000 0.00000 0.00000 0.00000


After a single step of policy improvement, the greedy policy already has a non-zero value in some states:

In [14]:
new_policy = policy_improvement(env, V)

In [15]:
V = policy_evaluation(new_policy, env)

In [16]:
print_policy(new_policy, 16, 4, name='Improved Policy')



[1mImproved Policy[0m


         ←          ←          ←          ←
         ←          ■          ←          ■
         ←          ←          ←          ■
         ■          ←          ↓          ■


In [17]:
print_state_value_func(V, 4)

0.00000 0.00000 0.03846 0.01923
0.00000 0.00000 0.07692 0.00000
0.00000 0.00000 0.19231 0.00000
0.00000 0.00000 0.50000 0.00000


In [33]:
pi, V = policy_iteration(random_policy, env, gamma=0.99)

In [34]:
print_policy(pi, 16, 4, name='Optimal Policy through policy iteration')



[1mOptimal Policy through policy iteration[0m


         ←          ↑          ↑          ↑
         ←          ■          ←          ■
         ↑          ↓          ←          ■
         ■          →          ↓          ■


In [35]:
print_state_value_func(V, 4)

0.54203 0.49880 0.47070 0.45685
0.55845 0.00000 0.35835 0.00000
0.59180 0.64308 0.61521 0.00000
0.00000 0.74172 0.86284 0.00000


In [36]:
pi, V = value_iteration(env, gamma=0.99)

In [37]:
print_policy(pi, 16, 4, name='Optimal Policy through value iteration')



[1mOptimal Policy through value iteration[0m


         ←          ↑          ↑          ↑
         ←          ■          ←          ■
         ↑          ↓          ←          ■
         ■          →          ↓          ■


In [38]:
print_state_value_func(V, 4)

0.54203 0.49880 0.47070 0.45685
0.55845 0.00000 0.35835 0.00000
0.59180 0.64308 0.61521 0.00000
0.00000 0.74172 0.86284 0.00000


## Taxi

In [63]:
env = gym.make('Taxi-v3')
env.render()

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |B: |
+---------+



In [85]:
pi, V = value_iteration(env, gamma=0.99)

In [92]:
# Play an episode
# Roll out one episode of the current `env`, always taking the action that
# the current policy `pi` returns for the latest observation, and redraw the
# rendered environment in place after each pause.
# NOTE(review): relies on `env` and `pi` left behind by earlier cells
# (presumably the Taxi environment and its value-iteration policy) — confirm
# by running the notebook top to bottom.
from IPython.display import clear_output
from time import sleep
# reset() is used as returning the initial observation directly
obs, done = env.reset(), False
while not done:
    # slow the loop down so each frame stays visible for half a second
    sleep(0.5)
    # wait=True delays clearing until new output arrives, avoiding flicker
    clear_output(wait=True)
    env.render()
    action = pi(obs)
    # step() is unpacked as (observation, reward, done flag, extra info)
    next_obs, reward, done, _ = env.step(action)
    obs = next_obs


+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[42mY[0m[0m| : |B: |
+---------+
  (South)
