# Markov Decision Process

In [1]:
import torch
import itertools
import random

from envs import ObservableEnv, env_gridworld
import rl.mdp as mdp

## Slides Example

### Make Environment as torch tensor

* Reward:
    A 2D array of the rewards of $[s, a]$

* Transition:
    A 3D array of the probability of reaching state $s'$ under state $s$ and taking aciton $a$
    
* Value:
    A 1D array of the expected value at state $s$

In [2]:
states = ['PU', 'PF', 'RU', 'RF']
actions = ['S', 'A']
rewards = torch.tensor([[0, 0],
                        [0, 0],
                        [10, 10],
                        [10, 10]], dtype=torch.float32)
transitions = torch.tensor([[[1, 0, 0, 0], [0.5, 0.5, 0, 0]],
                            [[0.5, 0., 0., 0.5], [0., 1., 0., 0.]],
                            [[0.5, 0., 0.5, 0.], [0.5, 0.5, 0., 0.]],
                            [[0., 0., 0.5, 0.5], [0., 1., 0., 0.]]], dtype=torch.float32)
gamma = 0.9

In [3]:
env = ObservableEnv(states, actions, transitions, rewards)

### Value Iteration

In [4]:
value = torch.tensor([0, 0, 10, 10], dtype=torch.float32)

value_iteration = mdp.ValueIteration(env, value, gamma)
value_iteration.run()

Converged in 90 iterations.


(tensor([31.5824, 38.6013, 44.0214, 54.1988]), tensor([1, 0, 0, 0]))

### Policy Iteration

In [5]:
value = torch.tensor([0, 0, 10, 10], dtype=torch.float32)
policy = torch.tensor([random.randint(0, 1) for _ in range(4)])

policy_iteration = mdp.PolicyIteration(env, value, policy, gamma)
policy_iteration.run()

Converged in 2 iterations.


(tensor([ 2.0250,  4.5000, 16.5250, 21.0250]), tensor([1, 0, 0, 0]))

### Modified Policy Iteration

In [6]:
value = torch.tensor([0, 0, 10, 10], dtype=torch.float32)
policy = torch.tensor([random.randint(0, 1) for _ in range(4)])
n_evals = 5

modified_policy_iteration = mdp.ModifiedPolicyIteration(env, value, policy, gamma, n_evals)
modified_policy_iteration.run()

Converged in 19 iterations.


(tensor([31.5849, 38.6038, 44.0239, 54.2014]), tensor([1, 0, 0, 0]))

## Assigment 1 Part 1

### Make Environment

Grid world layout:

  \---------------------  
  |  0 |  1 |  2 |  3 |  
  \---------------------  
  |  4 |  5 |  6 |  7 |  
  \---------------------  
  |  8 |  9 | 10 | 11 |  
  \---------------------  
  | 12 | 13 | 14 | 15 |  
  \---------------------  

  Goal state: 15   
  Bad state: 9  
  End state: 16  
 
$|S| = 17$  
$|A| = 4$

transitions: $|S| * |A| * |S'|$  
rewards = $|S| * |A|$

In [7]:
states = [str(i) for i in range(17)]
actions = ['up', 'down', 'left', 'right']
transitions = env_gridworld.transtions
rewards = env_gridworld.rewards

gamma = 0.95

In [8]:
env = ObservableEnv(states, actions, transitions, rewards)

### Value Iteration

In [9]:
value = torch.empty(len(states), dtype=torch.float32)

value_iteration = mdp.ValueIteration(env, value, gamma)
value_iteration.run()

Converged in 23 iterations.


(tensor([ 39.6668,  46.4196,  54.0687,  61.7166,  40.8314,  47.9155,  62.9674,
          72.6333,  41.8038,  -6.6523,  73.7750,  85.3184,  55.0558,  65.7478,
          85.3184, 100.0000,   0.0000]),
 tensor([3, 3, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 3, 3, 3, 3, 3]))

### Policy Iteration

In [10]:
value = torch.empty(len(states), dtype=torch.float32)
policy = torch.tensor([random.randint(0, len(actions) - 1) for _ in range(len(states))], dtype=torch.int64)

policy_iteration = mdp.PolicyIteration(env, value, policy, gamma)
policy_iteration.run()

Converged in 9 iterations.


(tensor([4.9419e+01, 5.9734e+01, 6.8967e+01, 7.5514e+01, 5.1799e+01, 6.2376e+01,
         7.6428e+01, 8.3726e+01, 5.5216e+01, 6.5897e+00, 8.4529e+01, 9.1685e+01,
         6.7922e+01, 7.6410e+01, 9.1685e+01, 1.0000e+02, 1.1351e-43]),
 tensor([3, 3, 3, 1, 3, 3, 3, 1, 1, 3, 3, 1, 3, 3, 3, 3, 3]))

### Modified Policy Iteration

In [11]:
value = torch.empty(len(states))
policy = torch.tensor([random.randint(0, len(actions) - 1) for _ in range(len(states))])
n_evals = 5

modified_policy_iteration = mdp.ModifiedPolicyIteration(env, value, policy, gamma, n_evals)
modified_policy_iteration.run()

Converged in 7 iterations.


(tensor([6.0633e+01, 6.6039e+01, 7.1806e+01, 7.7093e+01, 5.9819e+01, 6.5185e+01,
         7.7832e+01, 8.4141e+01, 5.8096e+01, 7.9886e+00, 8.4867e+01, 9.1782e+01,
         6.9497e+01, 7.6810e+01, 9.1782e+01, 1.0000e+02, 1.2612e-44]),
 tensor([3, 3, 3, 1, 3, 3, 3, 1, 1, 3, 3, 1, 3, 3, 3, 3, 3]))