<center><h1> Dynamic Programming </h1></center>
<hr>

### Simple GridWorld implementation of Policy Evaluation, Policy Iteration and Value Iteration algorithms 
---

In [1]:
import numpy as np

### Environment

In [2]:
class Environment:
    """
    Simple GridWorld used by the dynamic-programming routines.

    Attributes:
        grid     - 4x4 array of state values, initialised to zero
        reward   - reward received at each step
        d_values - index offset applied for each action,
                   ordered ['up','down','left','right']
    """

    grid = np.zeros((4,4))  # Class-level default, kept so `Environment.grid` still resolves
    reward = -1             # Reward at each step
    d_values = [-1,1,-1,1]  # ['up','down','left','right']

    def __init__(self):
        # The algorithms write into `env.grid` in place. Without a per-instance
        # array every Environment object would share (and overwrite) the single
        # class-level grid above.
        self.grid = np.zeros((4,4))

### Global variables

In [3]:
# GridWorld instance handed to every algorithm below
env = Environment()

# Random policy: pi[row, col, action] gives each of the 4 actions probability 0.25
pi = np.full((4, 4, 4), 0.25)

## Policy Evaluation

In [4]:
def pol_evaluation(env, pi, theta=1.0e-10, gamma=1):
    """
    Policy Evaluation algorithm

    Sweeps over the grid, replacing each state value with the expected
    one-step return under `pi`, until the largest change seen in a sweep is
    no greater than `theta`. Each sweep reads from a snapshot of the previous
    values, and `env.grid` is updated in place.

    Parameters passed - Environment object and Policy
        env   - object exposing `grid` (2-D array of state values), `reward`
                (reward at each step) and `d_values` (index offset per action,
                ordered up, down, left, right)
        pi    - array indexed as pi[row, col, action] with action probabilities
        theta - convergence threshold; smaller the value, more precise the result
        gamma - discount factor

    Returns - `env.grid` (the same array that was updated in place)

    The top-left and bottom-right corners are never updated, so they behave
    as fixed-value (terminal) states. An action that would leave the grid
    keeps the agent in its current state.

    NOTE(review): with gamma = 1 the values of a policy that never reaches
    one of the corner states keep decreasing, so the loop is not expected to
    stop for such a policy.
    """

    n_rows, n_cols = env.grid.shape  # Grid size is taken from the environment
    last_row, last_col = n_rows - 1, n_cols - 1
    skipped_states = ((0, 0), (last_row, last_col))

    def next_state(row, col, action):
        # Actions 0/1 (up/down) shift the row, actions 2/3 (left/right) the column.
        # Moves that would cross a border leave the state unchanged.
        if action < 2:
            if (row == 0 and action == 0) or (row == last_row and action == 1):
                return row, col
            return row + env.d_values[action], col
        if (col == 0 and action == 2) or (col == last_col and action == 3):
            return row, col
        return row, col + env.d_values[action]

    while True:

        delta = 0
        grid_old = env.grid.copy()  # Values from the previous sweep

        for row in range(n_rows):
            for col in range(n_cols):  # Iterate through all states

                if (row, col) in skipped_states:
                    continue

                # Expected return, accumulated in the order up, down, left, right
                expected = 0.0
                for action in range(4):
                    next_row, next_col = next_state(row, col, action)
                    expected += pi[row,col,action]*(env.reward + gamma*grid_old[next_row,next_col])

                env.grid[row,col] = expected
                delta = max(delta, abs(grid_old[row,col] - expected))

        if delta <= theta:  # Largest change of the sweep is small enough
            break

    return env.grid


In [5]:
# Evaluate the random policy and show the resulting state values.
# The single-argument parenthesised form prints the same text on Python 2 and 3.
print("Grid value function: \n\n" + str(pol_evaluation(env, pi)))

Grid value function: 

[[  0. -14. -20. -22.]
 [-14. -18. -20. -20.]
 [-20. -20. -18. -14.]
 [-22. -20. -14.   0.]]


## Policy Iteration

In [6]:
def pol_iteration(env, pi):
    """
    Policy Iteration algorithm

    Repeats two steps until the policy stops changing:
      1. evaluate the current policy with `pol_evaluation`, storing the
         result in `env.grid`;
      2. make the policy greedy with respect to those values, splitting the
         probability equally between actions that tie for the best value.

    Evaluation runs before every improvement step, so the result does not
    depend on `env.grid` already holding the values of `pi` on entry.

    Parameters passed - Environment object and Policy
        env - object exposing `grid`, `reward` and `d_values`
        pi  - array indexed as pi[row, col, action]; updated in place

    Returns - tuple (env.grid, pi)
    """

    gamma = 1

    n_rows, n_cols = env.grid.shape
    last_row, last_col = n_rows - 1, n_cols - 1

    def action_values(row, col):
        # One-step return of every action from (row, col); a move that would
        # cross a border leaves the agent in the same state.
        values = np.zeros(4)
        for action in range(4):
            next_row, next_col = row, col
            if action < 2:  # up or down
                if not ((row == 0 and action == 0) or (row == last_row and action == 1)):
                    next_row = row + env.d_values[action]
            else:           # left or right
                if not ((col == 0 and action == 2) or (col == last_col and action == 3)):
                    next_col = col + env.d_values[action]
            values[action] = env.reward + gamma*env.grid[next_row,next_col]
        return values

    while True:

        # Improvement must work from the values of the *current* policy
        env.grid = pol_evaluation(env,pi)

        stable = True

        for row in range(n_rows):
            for col in range(n_cols):  # Iterate through all states (corners included)

                old_action = pi[row,col,:].copy()  # Policy before the update

                value = action_values(row, col)
                best = (value == np.max(value))    # Marks every action tied for the best value

                # Equal probability for each best action, zero for the rest.
                # The float cast keeps this a true division on Python 2 as well.
                pi[row,col,:] = best.astype(float) / np.sum(best)

                if not np.array_equal(old_action, pi[row,col,:]):
                    stable = False

        if stable:
            break

    return env.grid, pi

In [7]:
# Run policy iteration on the shared environment and policy
grid, pol = pol_iteration(env,pi)

# Single-argument parenthesised prints give the same text on Python 2 and 3
print("Grid value function: \n\n" + str(grid))
print("\n\n Improved Policy: \n\n" + str(pol))

Grid value function: 

[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]


 Improved Policy: 

[[[0.5  0.   0.5  0.  ]
  [0.   0.   1.   0.  ]
  [0.   0.   1.   0.  ]
  [0.   0.5  0.5  0.  ]]

 [[1.   0.   0.   0.  ]
  [0.5  0.   0.5  0.  ]
  [0.25 0.25 0.25 0.25]
  [0.   1.   0.   0.  ]]

 [[1.   0.   0.   0.  ]
  [0.25 0.25 0.25 0.25]
  [0.   0.5  0.   0.5 ]
  [0.   1.   0.   0.  ]]

 [[0.5  0.   0.   0.5 ]
  [0.   0.   0.   1.  ]
  [0.   0.   0.   1.  ]
  [0.   0.5  0.   0.5 ]]]


## Value Iteration

In [8]:
def val_iteration(env, pi, theta=1.0e-10, gamma=1):
    """
    Value Iteration algorithm

    Phase 1 sweeps over the grid, overwriting each state value in place with
    the highest one-step action-value, until the largest change seen in a
    sweep is no greater than `theta`. The top-left and bottom-right corners
    are skipped in this phase and keep their current value.

    Phase 2 writes a deterministic policy into `pi`: for every state
    (corners included) the first action with the highest action-value gets
    probability 1 and the others 0.

    Parameters passed - Environment object and Policy
        env   - object exposing `grid` (2-D array of state values), `reward`
                (reward at each step) and `d_values` (index offset per action,
                ordered up, down, left, right)
        pi    - array indexed as pi[row, col, action]; overwritten in place
        theta - convergence threshold; smaller the value, more precise the result
        gamma - discount factor

    Returns - tuple (env.grid, pi)
    """

    n_rows, n_cols = env.grid.shape  # Grid size is taken from the environment
    last_row, last_col = n_rows - 1, n_cols - 1
    skipped_states = ((0, 0), (last_row, last_col))

    def action_values(row, col):
        # One-step return of every action from (row, col), read from the
        # current `env.grid`; a move that would cross a border leaves the
        # agent in the same state.
        values = np.zeros(4)
        for action in range(4):
            next_row, next_col = row, col
            if action < 2:  # up or down
                if not ((row == 0 and action == 0) or (row == last_row and action == 1)):
                    next_row = row + env.d_values[action]
            else:           # left or right
                if not ((col == 0 and action == 2) or (col == last_col and action == 3)):
                    next_col = col + env.d_values[action]
            values[action] = env.reward + gamma*env.grid[next_row,next_col]
        return values

    # Phase 1: value sweeps
    while True:
        delta = 0
        for row in range(n_rows):
            for col in range(n_cols):  # Iterate through all states

                if (row, col) in skipped_states:
                    continue

                v = env.grid[row,col]
                env.grid[row,col] = np.max(action_values(row, col))  # Highest action-value

                delta = max(delta, abs(v - env.grid[row,col]))

        if delta <= theta:  # Largest change of the sweep is small enough
            break

    # Phase 2: policy that is greedy with respect to the final values
    for row in range(n_rows):
        for col in range(n_cols):  # Iterate through all states

            max_arg = np.argmax(action_values(row, col))  # Index of (first) best action
            pi[row,col,:] = 0
            pi[row,col,max_arg] = 1  # Deterministic policy to take best possible action

    return env.grid, pi
    

In [9]:
# Run value iteration on the shared environment and policy
grid, policy = val_iteration(env,pi)

# Single-argument parenthesised prints give the same text on Python 2 and 3
print("Final grid value function: \n\n" + str(grid))
print("\n\n Optimal Policy: \n\n" + str(policy))

Final grid value function: 

[[ 0. -1. -2. -3.]
 [-1. -2. -3. -2.]
 [-2. -3. -2. -1.]
 [-3. -2. -1.  0.]]


 Optimal Policy: 

[[[1. 0. 0. 0.]
  [0. 0. 1. 0.]
  [0. 0. 1. 0.]
  [0. 1. 0. 0.]]

 [[1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [0. 1. 0. 0.]]

 [[1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [0. 1. 0. 0.]
  [0. 1. 0. 0.]]

 [[1. 0. 0. 0.]
  [0. 0. 0. 1.]
  [0. 0. 0. 1.]
  [0. 1. 0. 0.]]]
