In [None]:
# Grid layout, indexed as gridworld[y][x]:
#   X is horizontal (column index), 0 to 4
#   Y is vertical (row index), 0 to 4
# 'A' and 'B' mark the special source cells, 'A*' and 'B*' the cells they
# send the agent to (see eval_a_s); '_' is an ordinary cell.
gridworld = [
    row.split()
    for row in (
        "_  A   _  B   _",
        "_  _   _  _   _",
        "_  _   _  B*  _",
        "_  A*  _  _   _",
        "_  _   _  _   _",
    )
]

# Discount factor applied to the value of the successor state.
DISCOUNT = 0.9
# Sweeps stop once the largest per-state value change drops below this.
CONVERGENCE_THRESHOLD = 1e-5


def search_gridworld(s):
    """Return the (x, y) position of the first cell of ``gridworld`` equal to ``s``.

    Rows are scanned top to bottom and, within each row, left to right; the
    width of the first row is used as the column count for every row.
    When no cell matches, the function falls off the end and returns ``None``.
    """
    for y, row in enumerate(gridworld):
        for x in range(len(gridworld[0])):
            if row[x] == s:
                return x, y


def eval_a_s(a, agent_X, agent_Y):
    """Return the outcome of taking action ``a`` from cell (agent_X, agent_Y).

    Args:
        a: One of 'N', 'S', 'E', 'W'. Ignored when the agent stands on the
            'A' or 'B' cell.
        agent_X: Column index of the agent (horizontal position).
        agent_Y: Row index of the agent (vertical position).

    Returns:
        A tuple ``(new_X, new_Y, reward)``:
        * from 'A': the position of 'A*' with reward 10;
        * from 'B': the position of 'B*' with reward 5;
        * a move that would leave the grid: the unchanged position with
          reward -1;
        * any other move: the neighbouring cell with reward 0.

    Raises:
        KeyError: if ``a`` is not a known direction and the agent is on an
            ordinary cell.
    """
    cell = gridworld[agent_Y][agent_X]

    # The special cells relocate the agent regardless of the chosen action.
    if cell == 'A':
        new_X, new_Y = search_gridworld('A*')
        return new_X, new_Y, 10

    if cell == 'B':
        new_X, new_Y = search_gridworld('B*')
        return new_X, new_Y, 5

    direction_map = {
        'N': (0, -1),  # Move North (decrease Y)
        'S': (0, 1),   # Move South (increase Y)
        'E': (1, 0),   # Move East (increase X)
        'W': (-1, 0),  # Move West (decrease X)
    }
    delta_X, delta_Y = direction_map[a]
    new_X = agent_X + delta_X
    new_Y = agent_Y + delta_Y

    # X indexes columns and Y indexes rows, so each axis needs its own
    # bound; checking X against the row count is only right on square grids.
    inside_grid = (
        0 <= new_X < len(gridworld[0])
        and 0 <= new_Y < len(gridworld)
    )
    if not inside_grid:
        # Bumping into the wall: stay put and pay a penalty.
        return agent_X, agent_Y, -1
    return new_X, new_Y, 0
    

def update_state_values(v):
    """Run one sweep of policy evaluation under the equiprobable policy.

    For every cell, the new value is the average over the four actions of
    ``reward + DISCOUNT * v[successor]``, where the reward and successor come
    from ``eval_a_s``. All new values are computed from the input ``v``; the
    input is not modified.

    Args:
        v: Grid (list of rows) of current state values, indexed ``v[y][x]``.

    Returns:
        A new grid of state values with the same shape as ``v``.
    """
    actions = ['N', 'S', 'E', 'W']

    # Size the result from v itself rather than from a hard-coded 5x5
    # literal, so the sweep keeps working if the grid dimensions change.
    new_v = [[0] * len(row) for row in v]

    for y, row in enumerate(v):
        for x in range(len(row)):
            # Accumulate explicitly, in action order, to keep the floating
            # point result independent of how sum() is implemented.
            new_value = 0
            for a in actions:
                new_x, new_y, r = eval_a_s(a, x, y)
                new_value += (r + (DISCOUNT * v[new_y][new_x]))
            # All actions are of equal probability
            new_value /= len(actions)
            new_v[y][x] = new_value

    return new_v

In [None]:
# Iterative policy evaluation: start from all-zero values and sweep until
# the largest per-state change is below CONVERGENCE_THRESHOLD.
v = [[0] * 5 for _ in range(5)]

while True:
    new_v = update_state_values(v)
    max_delta = max(
        (
            abs(new_value - old_value)
            for new_row, old_row in zip(new_v, v)
            for new_value, old_value in zip(new_row, old_row)
        ),
        default=float('-inf'),
    )
    if max_delta < CONVERGENCE_THRESHOLD:
        # v is left at the previous sweep's values; they differ from new_v
        # by less than the threshold.
        break
    v = new_v

# Each printed cell is followed by a tab, including the last one in a row.
print('v_pi(s):')
for value_row in v:
    print(''.join('{:.2f}'.format(value) + '\t' for value in value_row))

print()

print('Gridworld:')
for cell_row in gridworld:
    print(''.join(cell + '\t' for cell in cell_row))

v_pi(s):
3.79	9.68	4.76	5.37	1.53	
1.80	3.33	2.44	1.99	0.59	
0.20	0.89	0.78	0.42	-0.36	
-0.89	-0.36	-0.30	-0.55	-1.15	
-1.80	-1.30	-1.19	-1.39	-1.95	

Gridworld:
_	A	_	B	_	
_	_	_	_	_	
_	_	_	B*	_	
_	A*	_	_	_	
_	_	_	_	_	


In [None]:
def update_state_values_optimal(v):
    """Run one sweep of value iteration and record the greedy actions.

    For every cell, each action is scored as
    ``reward + DISCOUNT * v[successor]`` (reward and successor come from
    ``eval_a_s``); the cell's new value is the best score. All scores are
    computed from the input ``v``; the input is not modified.

    Args:
        v: Grid (list of rows) of current state values, indexed ``v[y][x]``.

    Returns:
        A tuple ``(new_v, best_actions)``: ``new_v`` is a grid of the best
        scores with the same shape as ``v``; ``best_actions`` is a grid of
        strings holding the letters of every action that attains the best
        score, in 'N', 'S', 'E', 'W' order.
    """
    actions = ['N', 'S', 'E', 'W']

    # Size both result grids from v itself rather than from hard-coded 5x5
    # literals, so the sweep keeps working if the grid dimensions change.
    new_v = [[0] * len(row) for row in v]
    best_actions = [[''] * len(row) for row in v]

    for y, row in enumerate(v):
        for x in range(len(row)):
            max_value = float('-inf')
            best_a = []
            for a in actions:
                new_x, new_y, r = eval_a_s(a, x, y)
                value = r + (DISCOUNT * v[new_y][new_x])
                if value > max_value:
                    max_value = value
                    best_a = [a]
                elif value == max_value:
                    # Ties use exact float equality: only actions whose
                    # score is bit-identical to the best are listed.
                    best_a.append(a)
            # Only select best action(s)
            new_v[y][x] = max_value
            best_actions[y][x] = ''.join(best_a)

    return new_v, best_actions

In [None]:
# Value iteration: start from all-zero values and sweep until converged.
v = [[0] * 5 for _ in range(5)]

# Stopping condition: the largest absolute change of any state value in a
# sweep is below CONVERGENCE_THRESHOLD.
while True:
    new_v, best_actions = update_state_values_optimal(v)
    max_delta = max(
        (
            abs(new_value - old_value)
            for new_row, old_row in zip(new_v, v)
            for new_value, old_value in zip(new_row, old_row)
        ),
        default=float('-inf'),
    )
    if max_delta < CONVERGENCE_THRESHOLD:
        # v is left at the previous sweep's values; best_actions is the
        # greedy choice computed from that same v.
        break
    v = new_v

# Alternative stopping condition (disabled, kept as an inert string): stop
# when the best actions no longer change from the previous sweep.
'''
best_actions = [[], [], [], [], []]

while True:
    new_v, new_best_actions = update_state_values_optimal(v)
    actions_eq_row_count = 0
    for a in range(len(best_actions)):
        if best_actions[a] == new_best_actions[a]:
            actions_eq_row_count += 1
    if actions_eq_row_count == len(best_actions):
        break
    v = new_v
    best_actions = new_best_actions
'''

# Each printed cell is followed by a tab, including the last one in a row.
print('v_pi*(s):')
for value_row in v:
    print(''.join('{:.2f}'.format(value) + '\t' for value in value_row))

print()

print('Best Actions:')
for action_row in best_actions:
    print(''.join(action + '\t' for action in action_row))

print()

print('Gridworld:')
for cell_row in gridworld:
    print(''.join(cell + '\t' for cell in cell_row))

v_pi*(s):
26.17	29.08	26.17	22.17	19.95	
23.55	26.17	23.55	21.20	19.08	
21.20	23.55	21.20	19.08	17.17	
19.08	21.20	19.08	17.17	15.45	
17.17	19.08	17.17	15.45	13.91	

Best Actions:
E	NSEW	W	NSEW	W	
NE	N	NW	W	W	
NE	N	NW	NW	NW	
NE	N	NW	NW	NW	
NE	N	NW	NW	NW	

Gridworld:
_	A	_	B	_	
_	_	_	_	_	
_	_	_	B*	_	
_	A*	_	_	_	
_	_	_	_	_	
