# Markov Decision Process

In [8]:
import numpy as np
import itertools
import random

from envs import ObservableEnv, env_gridworld
import rl.mdp as mdp

## Slides Example

### Make Environment as Numpy Array

* Reward:
    A 2D array of the rewards of $[s, a]$

* Transition:
    A 3D array of the probability of reaching state $s'$ when taking action $a$ in state $s$
    
* Value:
    A 1D array of the expected value at state $s$

In [3]:
# Labels only; the array axes below are indexed in exactly this order.
states = ['PU', 'PF', 'RU', 'RF']
actions = ['S', 'A']

# Discount factor handed to every solver in the cells below.
gamma = 0.9

# rewards[s, a]: one row per state, one column per action.
rewards = np.array([
    [0, 0],      # PU
    [0, 0],      # PF
    [10, 10],    # RU
    [10, 10],    # RF
])

# transitions[s, a, s']: probability of landing in s' after taking action a
# in state s. Every innermost row is a distribution over `states`.
transitions = np.array([
    # from PU
    [[1.0, 0.0, 0.0, 0.0],     # S
     [0.5, 0.5, 0.0, 0.0]],    # A
    # from PF
    [[0.5, 0.0, 0.0, 0.5],     # S
     [0.0, 1.0, 0.0, 0.0]],    # A
    # from RU
    [[0.5, 0.0, 0.5, 0.0],     # S
     [0.5, 0.5, 0.0, 0.0]],    # A
    # from RF
    [[0.0, 0.0, 0.5, 0.5],     # S
     [0.0, 1.0, 0.0, 0.0]],    # A
])

In [4]:
# Bundle the labels and the transition/reward arrays defined above into the
# environment object that the solver cells below receive as `env`.
env = ObservableEnv(states, actions, transitions, rewards)

### Value Iteration

In [5]:
# Initial value estimate, one entry per state in `states` order.
# Built as float (not the int dtype NumPy would infer from the literals) so a
# solver that writes estimates back into this array cannot silently truncate
# them; this also matches the float `np.zeros` start used for the gridworld.
value = np.array([0, 0, 10, 10], dtype=float)

value_iteration = mdp.ValueIteration(env, value, gamma)
# Kept as the cell's last expression so the result of run() is displayed.
value_iteration.run()

Converged in 90 iterations.


(array([31.58236192, 38.60127399, 44.02143387, 54.19885637]),
 array([1, 0, 0, 0]))

### Policy Iteration

In [6]:
# Initial value estimate, one entry per state in `states` order. Float dtype so
# a solver that updates the array in place cannot truncate fractional values.
value = np.array([0, 0, 10, 10], dtype=float)

# Random starting policy: one action index per state.
# Seeded so the same policy (and therefore the same run) is produced on every
# Restart & Run All; sizes are taken from `states`/`actions` rather than the
# hard-coded 4 and 0..1, matching the gridworld cells.
random.seed(0)
policy = np.array([random.randint(0, len(actions) - 1) for _ in range(len(states))])

policy_iteration = mdp.PolicyIteration(env, value, policy, gamma)
# Kept as the cell's last expression so the result of run() is displayed.
policy_iteration.run()

Converged in 2 iterations.


(array([ 2.025,  4.5  , 16.525, 21.025]), array([1, 0, 0, 0]))

### Modified Policy Iteration

In [7]:
# Initial value estimate, one entry per state in `states` order. Float dtype so
# a solver that updates the array in place cannot truncate fractional values.
value = np.array([0, 0, 10, 10], dtype=float)

# Random starting policy: one action index per state.
# Seeded so the same policy (and therefore the same run) is produced on every
# Restart & Run All; sizes are taken from `states`/`actions` rather than the
# hard-coded 4 and 0..1, matching the gridworld cells.
random.seed(0)
policy = np.array([random.randint(0, len(actions) - 1) for _ in range(len(states))])

# Forwarded to ModifiedPolicyIteration as its last argument; presumably the
# number of evaluation sweeps per improvement step - confirm in rl.mdp.
n_evals = 5

modified_policy_iteration = mdp.ModifiedPolicyIteration(env, value, policy, gamma, n_evals)
# Kept as the cell's last expression so the result of run() is displayed.
modified_policy_iteration.run()

Converged in 13 iterations.


(array([31.58334865, 38.60228564, 44.02239741, 54.19992339]),
 array([1, 0, 0, 0]))

## Assignment 1 Part 1

### Make Environment

Grid world layout:

  \---------------------  
  |  0 |  1 |  2 |  3 |  
  \---------------------  
  |  4 |  5 |  6 |  7 |  
  \---------------------  
  |  8 |  9 | 10 | 11 |  
  \---------------------  
  | 12 | 13 | 14 | 15 |  
  \---------------------  

  Goal state: 15   
  Bad state: 9  
  End state: 16  
 
$|S| = 17$  
$|A| = 4$

transitions: $|S| \times |A| \times |S'|$  
rewards: $|S| \times |A|$

In [14]:
# Discount factor handed to the gridworld solvers below.
gamma = 0.95

# String labels "0".."16": grid cells 0-15 plus the extra end state 16.
states = list(map(str, range(17)))
actions = ['up', 'down', 'left', 'right']

# Dynamics and rewards come ready-made from the env_gridworld module.
# NOTE: the attribute really is spelled `transtions` here; keep it in sync with
# the module rather than "correcting" it in this cell alone.
transitions = env_gridworld.transtions
rewards = env_gridworld.rewards

In [15]:
# Rebind `env` to the gridworld: from here on the solver cells operate on the
# 17-state / 4-action arrays loaded from env_gridworld above.
env = ObservableEnv(states, actions, transitions, rewards)

### Value Iteration

In [21]:
# Every state starts from a value estimate of zero.
value = np.zeros(len(states), dtype=float)
value_iteration = mdp.ValueIteration(env, value, gamma)

# Kept as the cell's last expression so the result of run() is displayed.
value_iteration.run()

Converged in 23 iterations.


(array([ 39.66673616,  46.41953402,  54.06871129,  61.7166396 ,
         40.83139756,  47.91550368,  62.96734881,  72.63330321,
         41.80375928,  -6.65234296,  73.77496298,  85.31840171,
         55.05578488,  65.74783919,  85.31840171, 100.        ,
          0.        ]),
 array([3, 3, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 3, 3, 3, 0, 0]))

### Policy Iteration

In [19]:
# Every state starts from a value estimate of zero.
value = np.zeros(len(states))

# Random starting policy: one action index per state.
# Seeded so the same policy (and therefore the same run) is produced on every
# Restart & Run All instead of changing with each execution.
random.seed(0)
policy = np.array([random.randint(0, len(actions) - 1) for _ in range(len(states))])

policy_iteration = mdp.PolicyIteration(env, value, policy, gamma)
# Kept as the cell's last expression so the result of run() is displayed.
policy_iteration.run()

Converged in 9 iterations.


(array([ 49.41906608,  59.73407408,  68.96677157,  75.51410453,
         51.79894022,  62.37622474,  76.42752305,  83.72598624,
         55.21588678,   6.58968597,  84.5293075 ,  91.68488264,
         67.92184379,  76.40994525,  91.68488264, 100.        ,
          0.        ]),
 array([3, 3, 3, 1, 3, 3, 3, 1, 1, 3, 3, 1, 3, 3, 3, 0, 0]))

### Modified Policy Iteration

In [23]:
# Every state starts from a value estimate of zero.
value = np.zeros(len(states))

# Random starting policy: one action index per state.
# Seeded so the same policy (and therefore the same run) is produced on every
# Restart & Run All instead of changing with each execution.
random.seed(0)
policy = np.array([random.randint(0, len(actions) - 1) for _ in range(len(states))])

# Forwarded to ModifiedPolicyIteration as its last argument; presumably the
# number of evaluation sweeps per improvement step - confirm in rl.mdp.
n_evals = 5

modified_policy_iteration = mdp.ModifiedPolicyIteration(env, value, policy, gamma, n_evals)
# Kept as the cell's last expression so the result of run() is displayed.
modified_policy_iteration.run()

Converged in 4 iterations.


(array([ 60.63220183,  66.03886109,  71.80619988,  77.09294032,
         59.8192882 ,  65.18452884,  77.83150596,  84.14148782,
         58.09554418,   7.9886184 ,  84.86730342,  91.78165054,
         69.49679766,  76.80991419,  91.78165054, 100.        ,
          0.        ]),
 array([3, 3, 3, 1, 3, 3, 3, 1, 1, 3, 3, 1, 3, 3, 3, 0, 0]))