# Markov Decision Process

In [1]:
import torch
import itertools
import random

from envs import ObservableEnv, env_gridworld
import rl.mdp as mdp

## Slides Example

### Make Environment as Torch Tensors

* Reward:
    A 2D array of the rewards of $[s, a]$

* Transition:
    A 3D array of the probability of reaching state $s'$ from state $s$ when taking action $a$
    
* Value:
    A 1D array of the expected value at state $s$

In [2]:
# Tabular MDP from the slides: four states, two actions, discount 0.9.
states = ['PU', 'PF', 'RU', 'RF']
actions = ['S', 'A']
gamma = 0.9

# rewards[s, a]: the reward is the same for both actions in every state
# (0 in PU/PF, 10 in RU/RF), so a per-state column is repeated per action.
rewards = (
    torch.tensor([0.0, 0.0, 10.0, 10.0], dtype=torch.float32)
    .unsqueeze(1)
    .repeat(1, len(actions))
)

# transitions[s, a, s']: probability of landing in s' after taking action a
# in state s. Every innermost row is a distribution over `states`.
transitions = torch.tensor(
    [
        # from PU
        [[1.0, 0.0, 0.0, 0.0],    # action S
         [0.5, 0.5, 0.0, 0.0]],   # action A
        # from PF
        [[0.5, 0.0, 0.0, 0.5],    # action S
         [0.0, 1.0, 0.0, 0.0]],   # action A
        # from RU
        [[0.5, 0.0, 0.5, 0.0],    # action S
         [0.5, 0.5, 0.0, 0.0]],   # action A
        # from RF
        [[0.0, 0.0, 0.5, 0.5],    # action S
         [0.0, 1.0, 0.0, 0.0]],   # action A
    ],
    dtype=torch.float32,
)

In [3]:
# Bundle the state/action labels with the transition and reward tensors
# defined above into the environment object the solvers below take as input.
env = ObservableEnv(states, actions, transitions, rewards)

### Value Iteration

In [4]:
# Initial value estimate, one entry per state (matches each state's reward:
# 0 for PU/PF, 10 for RU/RF).
value = torch.tensor([0.0, 0.0, 10.0, 10.0], dtype=torch.float32)

# Build the solver on the slides environment; the result of run() is left as
# the cell's last expression so the notebook displays it.
value_iteration = mdp.ValueIteration(env, value, gamma)
value_iteration.run()

Converged in 90 iterations.


(tensor([31.5824, 38.6013, 44.0214, 54.1988]), tensor([1, 0, 0, 0]))

### Policy Iteration

In [5]:
# Initial value estimate, one entry per state (same start as value iteration).
value = torch.tensor([0, 0, 10, 10], dtype=torch.float32)

# Random initial policy: one action index per state. A dedicated seeded
# generator keeps the starting policy identical across "Restart & Run All"
# without touching the global `random` state, and the sizes are taken from
# `states` / `actions` instead of the literals 4 and (0, 1).
rng = random.Random(0)
policy = torch.tensor([rng.randint(0, len(actions) - 1) for _ in range(len(states))])

# The result of run() is the cell's last expression so the notebook shows it.
policy_iteration = mdp.PolicyIteration(env, value, policy, gamma)
policy_iteration.run()

Converged in 2 iterations.


(tensor([ 0.0000,  4.5000, 14.5000, 19.0000]), tensor([1, 0, 0, 0]))

### Modified Policy Iteration

In [6]:
# Initial value estimate, one entry per state (same start as value iteration).
value = torch.tensor([0, 0, 10, 10], dtype=torch.float32)

# Random initial policy: one action index per state. A dedicated seeded
# generator keeps the starting policy identical across "Restart & Run All"
# without touching the global `random` state, and the sizes are taken from
# `states` / `actions` instead of the literals 4 and (0, 1).
rng = random.Random(0)
policy = torch.tensor([rng.randint(0, len(actions) - 1) for _ in range(len(states))])

# Passed to ModifiedPolicyIteration as its fifth argument.
n_evals = 5

# The result of run() is the cell's last expression so the notebook shows it.
modified_policy_iteration = mdp.ModifiedPolicyIteration(env, value, policy, gamma, n_evals)
modified_policy_iteration.run()

Converged in 13 iterations.


(tensor([31.5833, 38.6022, 44.0223, 54.1999]), tensor([1, 0, 0, 0]))

## Assignment 1 Part 1

### Make Environment

Grid world layout:

  \---------------------  
  |  0 |  1 |  2 |  3 |  
  \---------------------  
  |  4 |  5 |  6 |  7 |  
  \---------------------  
  |  8 |  9 | 10 | 11 |  
  \---------------------  
  | 12 | 13 | 14 | 15 |  
  \---------------------  

  Goal state: 15   
  Bad state: 9  
  End state: 16  
 
$|S| = 17$  
$|A| = 4$

transitions: $|S| \times |A| \times |S'|$  
rewards: $|S| \times |A|$

In [7]:
# Gridworld MDP: 17 string-labelled states ('0' .. '16') and four moves.
states = list(map(str, range(17)))
actions = ['up', 'down', 'left', 'right']

# Discount factor used by every solver in this part.
gamma = 0.95

# Dynamics and rewards come precomputed from the env_gridworld module.
# NOTE(review): the attribute is spelled `transtions` here; presumably that
# matches the name defined in env_gridworld - confirm before renaming.
transitions = env_gridworld.transtions
rewards = env_gridworld.rewards

In [8]:
# Rebind `env` to the gridworld: same wrapper as before, now built from the
# gridworld states, actions, transitions and rewards defined above.
env = ObservableEnv(states, actions, transitions, rewards)

### Value Iteration

In [10]:
# Start from an all-zero value estimate, one entry per state.
# `torch.empty` must not be used here: it returns uninitialized memory, so the
# starting values would be arbitrary (possibly NaN/inf) and could differ from
# run to run.
value = torch.zeros(len(states), dtype=torch.float32)

# The result of run() is the cell's last expression so the notebook shows it.
value_iteration = mdp.ValueIteration(env, value, gamma)
value_iteration.run()

Converged in 23 iterations.


(tensor([ 39.6667,  46.4195,  54.0687,  61.7166,  40.8314,  47.9155,  62.9673,
          72.6333,  41.8038,  -6.6523,  73.7750,  85.3184,  55.0558,  65.7478,
          85.3184, 100.0000,   0.0000]),
 tensor([3, 3, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 3, 3, 3, 3, 3]))

### Policy Iteration

In [None]:
# All-zero initial value estimate, one entry per state. Built with torch
# (not numpy): `np` is never imported in this notebook, and every other cell
# passes torch tensors to the solvers.
value = torch.zeros(len(states), dtype=torch.float32)

# Random initial policy: one action index per state, drawn from a dedicated
# seeded generator so the starting policy is identical across
# "Restart & Run All" without touching the global `random` state.
rng = random.Random(0)
policy = torch.tensor([rng.randint(0, len(actions) - 1) for _ in range(len(states))])

# The result of run() is the cell's last expression so the notebook shows it.
policy_iteration = mdp.PolicyIteration(env, value, policy, gamma)
policy_iteration.run()

### Modified Policy Iteration

In [None]:
# All-zero initial value estimate, one entry per state. Built with torch
# (not numpy): `np` is never imported in this notebook, and every other cell
# passes torch tensors to the solvers.
value = torch.zeros(len(states), dtype=torch.float32)

# Random initial policy: one action index per state, drawn from a dedicated
# seeded generator so the starting policy is identical across
# "Restart & Run All" without touching the global `random` state.
rng = random.Random(0)
policy = torch.tensor([rng.randint(0, len(actions) - 1) for _ in range(len(states))])

# Passed to ModifiedPolicyIteration as its fifth argument.
n_evals = 5

# The result of run() is the cell's last expression so the notebook shows it.
modified_policy_iteration = mdp.ModifiedPolicyIteration(env, value, policy, gamma, n_evals)
modified_policy_iteration.run()