In [8]:
import numpy as np
import matplotlib.pyplot as plt
from grid_world import standard_grid, negative_grid

# Convergence threshold (0.001): the evaluation loops stop sweeping once the
# largest value change seen in a full sweep drops below this.
SMALL_ENOUGH = 1e-3

In [44]:
def print_values(V, g):
    """Print the value table ``V`` as a text grid, one printed row per ``i``.

    Args:
        V: mapping from ``(i, j)`` state tuples to numeric values. States that
            are missing from the mapping are displayed as 0.
        g: grid object exposing ``width`` and ``height``. ``width`` bounds the
            outer loop (printed rows) and ``height`` the inner loop (cells in
            a row).
    """
    # Each cell renders as 7 characters for |v| < 10 (sign slot, "d.dd",
    # space, bar), so size the separator from the actual number of cells
    # instead of assuming 4.
    separator = '-' * (7 * g.height)
    # The ' ' sign option reserves the column that a minus sign would occupy,
    # which keeps positive and negative values aligned.
    cell = '{: .2f} |'
    for i in range(g.width):
        print(separator)
        print(''.join(cell.format(V.get((i, j), 0)) for j in range(g.height)))
        
def print_policy(P, g):
    """Print the policy ``P`` as a text grid, one printed row per ``i``.

    Args:
        P: mapping from ``(i, j)`` state tuples to an action label. States
            that are missing from the mapping are displayed as a blank cell.
        g: grid object exposing ``width`` and ``height``. ``width`` bounds the
            outer loop (printed rows) and ``height`` the inner loop (cells in
            a row).
    """
    # A cell with a one-character action label is 7 characters wide, so size
    # the separator from the actual number of cells instead of assuming 4.
    separator = '-' * (7 * g.height)
    for i in range(g.width):
        print(separator)
        print(''.join('  {}   |'.format(P.get((i, j), ' ')) for j in range(g.height)))

In [40]:
# Iterative policy evaluation for a uniform random policy on the standard
# grid, printing a trace of every backup as it goes.
grid = standard_grid()
S = grid.all_states()

# Every state starts with a value estimate of 0.
V = {}
for state in S:
    V[state] = 0

gamma = 1.0  # discount factor

converged = False
while not converged:
    delta = 0  # largest value change seen during this sweep
    for s in S:
        print('-' * 60)
        old_v = V[s]
        print('OLD V = {}'.format(old_v))

        # States without an entry in grid.actions are not updated.
        if s not in grid.actions:
            print('{} is a terminal state'.format(s))
            continue

        available = grid.actions[s]
        new_v = 0
        # Uniform random policy: every available action is equally likely.
        p_a = 1.0 / len(available)
        for a in available:
            print('Im in State {} and doing action {}'.format(s, a))
            # Place the agent on s, take the action, and see where it lands.
            grid.set_state(s)
            r = grid.move(a)
            landed = grid.current_state()
            new_v = new_v + p_a * (r + gamma * V[landed])
            print('new_v += p_a * (r + gamma * V[grid.current_state()])')
            print('new_v += {} * ({} + {} * {})'.format(p_a, r, gamma, V[landed]))
            print('I got reward {} and now im in state {}'.format(r, landed))

        V[s] = new_v
        print('The new V[s]={}'.format(V[s]))
        delta = max(delta, np.abs(V[s] - old_v))
        print('The delta is {}'.format(delta))

    # Stop once a full sweep changes no value by SMALL_ENOUGH or more.
    converged = delta < SMALL_ENOUGH

print('Values for uniform random actions:')
print_values(V, grid)
print('\n\n')


------------------------------------------------------------
OLD V = 0
Im in State (0, 1) and doing action L
new_v += p_a * (r + gamma * V[grid.current_state()])
new_v += 0.5 * (0 + 1.0 * 0)
I got reward 0 and now im in state (0, 0)
Im in State (0, 1) and doing action R
new_v += p_a * (r + gamma * V[grid.current_state()])
new_v += 0.5 * (0 + 1.0 * 0)
I got reward 0 and now im in state (0, 2)
The new V[s]=0.0
The delta is 0
------------------------------------------------------------
OLD V = 0
Im in State (1, 2) and doing action U
new_v += p_a * (r + gamma * V[grid.current_state()])
new_v += 0.3333333333333333 * (0 + 1.0 * 0)
I got reward 0 and now im in state (0, 2)
Im in State (1, 2) and doing action D
new_v += p_a * (r + gamma * V[grid.current_state()])
new_v += 0.3333333333333333 * (0 + 1.0 * 0)
I got reward 0 and now im in state (2, 2)
Im in State (1, 2) and doing action R
new_v += p_a * (r + gamma * V[grid.current_state()])
new_v += 0.3333333333333333 * (-1 + 1.0 * 0)
I got reward

Im in State (2, 3) and doing action U
new_v += p_a * (r + gamma * V[grid.current_state()])
new_v += 0.5 * (-1 + 1.0 * 0)
I got reward -1 and now im in state (1, 3)
The new V[s]=-0.7672869322369609
The delta is 0.002720164653231494
------------------------------------------------------------
OLD V = -0.5345738644739216
Im in State (2, 2) and doing action L
new_v += p_a * (r + gamma * V[grid.current_state()])
new_v += 0.3333333333333333 * (0 + 1.0 * -0.4030821563886059)
I got reward 0 and now im in state (2, 1)
Im in State (2, 2) and doing action R
new_v += p_a * (r + gamma * V[grid.current_state()])
new_v += 0.3333333333333333 * (0 + 1.0 * -0.7672869322369609)
I got reward 0 and now im in state (2, 3)
Im in State (2, 2) and doing action U
new_v += p_a * (r + gamma * V[grid.current_state()])
new_v += 0.3333333333333333 * (0 + 1.0 * -0.43741528931228213)
I got reward 0 and now im in state (1, 2)
The new V[s]=-0.5359281259792829
The delta is 0.002720164653231494
---------------------------

In [45]:

# Deterministic policy to evaluate: one action label per state that has actions.
policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'R',
    (2, 1): 'R',
    (2, 2): 'R',
    (2, 3): 'U'
}

# Build the grid before anything uses it, so this cell does not depend on a
# `grid` variable left over from a previously executed cell.
grid = standard_grid()
print_policy(policy, grid)

# Iterative policy evaluation for the fixed policy above.
V = {}
S = grid.all_states()
for state in S:
    V[state] = 0
gamma = 0.9  # discount factor
while True:
    delta = 0  # largest value change seen during this sweep
    for s in S:
        old_v = V[s]

        # Only states with an entry in grid.actions are updated.
        if s in grid.actions:
            # The policy is deterministic, so the chosen action has probability 1.
            p_a = 1.0
            a = policy[s]
            # Place the agent on s, take the policy's action, and back up
            # from the value of the state it lands in.
            grid.set_state(s)
            r = grid.move(a)
            V[s] = p_a * (r + gamma * V[grid.current_state()])
            delta = max(delta, np.abs(V[s] - old_v))
    # Stop once a full sweep changes no value by SMALL_ENOUGH or more.
    if delta < SMALL_ENOUGH: break
print('Values for fixed policy actions:')
print_values(V, grid)
print('\n\n')

----------------------------
  R   |  R   |  R   |      |
----------------------------
  U   |      |  R   |      |
----------------------------
  U   |  R   |  R   |  U   |
Values for fixed policy actions:
----------------------------
 0.81 | 0.90 | 1.00 | 0.00 |
----------------------------
 0.73 | 0.00 |-1.00 | 0.00 |
----------------------------
 0.66 |-0.81 |-0.90 |-1.00 |





In [9]:
# Show the convergence threshold used by the evaluation loops.
SMALL_ENOUGH

0.001

In [18]:
# Display the current value table.
# NOTE(review): the recorded output is all zeros and this cell's execution
# count (18) is lower than the evaluation cells', so the output looks like a
# stale out-of-order run — re-run after the evaluation cells to confirm.
print_values(V, grid)

----------------------------
 0.00 | 0.00 | 0.00 | 0.00 |
----------------------------
 0.00 | 0.00 | 0.00 | 0.00 |
----------------------------
 0.00 | 0.00 | 0.00 | 0.00 |
