### Treasure hunter in 2D

In [23]:
import numpy as np
import pandas as pd
import time
import pprint
import copy
np.random.seed(794)

In [1]:
N_SIZE = 4
N_STATES = 16  # 4 x 4 2D world
STATE_index = [[i, j] for i in range(4) for j in range(4)]
ACTIONS = ['u', 'd', 'l', 'r']  # up, down, left, right
EPSILON = 0.9  # for greedy
ALPHA = 0.1  # LEARNING RATE
GAMMA = 0.9  # DISCOUNT FACTOR
MAX_EPISODES = 20
FRESH_TIME = 0.1
# P: you!, F: floor, W: Wall, T: treasure , 
# MAZE[0][0] is starting point, MAZE[3][3] is terminal
x0, y0 = 0, 0
MAZE = [  "FFFW", 
                 "FWFF",
                 "FFWW",
                 "FFFT"]
MAZE_LIST = []
for row in MAZE:
    l = [col for col in row]
    MAZE_LIST.append(l)
MAZE_LIST[1][0]

'F'

In [25]:
pprint.pprint(STATE_index)

[[0, 0],
 [0, 1],
 [0, 2],
 [0, 3],
 [1, 0],
 [1, 1],
 [1, 2],
 [1, 3],
 [2, 0],
 [2, 1],
 [2, 2],
 [2, 3],
 [3, 0],
 [3, 1],
 [3, 2],
 [3, 3]]


In [26]:
def build_Q_table(n_states, actions):
    table = pd.DataFrame(np.zeros((n_states, len(actions))), columns=actions)
    return table

def choose_action(state, q_table):
    state_actions = q_table.iloc[state,:]
    if (np.random.uniform() > EPSILON) or (state_actions.all()==0):
        action_name = np.random.choice(ACTIONS)
        # print(action_name)
        # print("random action {}".format(action_name))
    else:
        action_name = state_actions.argmax()
        # print("random greedy {}".format(action_name))
    return action_name

def move_check(x, y):
    if x > 3 or x < 0 or y > 3 or y < 0:  # illegal !
        return True
    else:
        pass

def get_env_feedback(S, A):
    # The argent can get rewards when Agent could arrive at Terminal.
    x,y = STATE_index[S]
    if A == 'u':
        x_, y_ = x - 1, y + 0  # UP
        if MAZE_LIST[x][y] == "T":
            S_ = 'terminal'
            R = 1
        elif move_check(x_, y_):
            S_ = STATE_index.index([x,y])
            R = -1
        elif MAZE_LIST[x_][y_] == "W": # OUCH
            S_ = STATE_index.index([x,y])
            R = -1
        else:
            S_ = STATE_index.index([x_,y_])
            R = 0
        return S_, R

    if A == 'd':
        x_, y_ = x + 1, y + 0  # DOWN
        if MAZE_LIST[x][y] == "T":
            S_ = 'terminal'
            R = 1
        elif move_check(x_, y_):
            S_ = STATE_index.index([x,y])
            R = -1
        elif MAZE_LIST[x_][y_] == "W": # OUCH
            S_ = STATE_index.index([x,y])
            R = -1
        else:
            S_ = STATE_index.index([x_,y_])
            R = 0
        return S_, R
 
    if A == 'l':
        x_, y_ = x + 0, y - 1  # LEFT
        if move_check(x_, y_):
            S_ = STATE_index.index([x,y])
            R = -1
        elif MAZE_LIST[x][y] == "T":
            S_ = 'terminal'
            R = 1
        elif MAZE_LIST[x_][y_] == "W": # OUCH
            S_ = STATE_index.index([x,y])
            R = -1
        else:
            S_ = STATE_index.index([x_,y_])
            R = 0
        return S_, R

    if A == 'r':
        x_, y_ = x + 0, y + 1  # RIGHT
        if move_check(x_, y_):
            S_ = STATE_index.index([x,y])
            R = -1
        elif MAZE_LIST[x][y] == "T":
            S_ = 'terminal'
            R = 1
        elif MAZE_LIST[x_][y_] == "W":  # OUCH
            S_ = STATE_index.index([x,y])
            R = -1
        else:
            S_ = STATE_index.index([x_,y_])
            R = 0
        return S_, R

def update_env(S, episode, step_counter):
    env = MAZE_LIST
    if S == 'terminal':
        interaction = 'Episode {}: total steps {}'.format(episode, step_counter)
        print('\r{}'.format(interaction), end=' ')
        time.sleep(2)
        print('\r                                          ', end=' ')
    else:
        new_env = copy.deepcopy(env)
        x, y = STATE_index[S]
        new_env[x][y] = "P"
        pprint.pprint("{}: steps".format(step_counter))
        pprint.pprint(new_env)
        time.sleep(FRESH_TIME)

In [27]:
def rlQlearn():
    q_table = build_Q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:
            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)
            q_predict = q_table.loc[S, A]
            if S_ != 'terminal':
                q_target = R + GAMMA * q_table.iloc[S_, :].max()
                #print(q_target)
            else:
                q_target = R
                is_terminated = True
            # print( q_target, q_predict, R)
            q_table.loc[S, A] += ALPHA * (q_target - q_predict)
            S = S_
            step_counter += 1
            update_env(S, episode, step_counter)
    return q_table

def rlSARSA():
    q_table = build_Q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:
            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)
            q_predict = q_table.loc[S, A]
            if S_ != 'terminal':
                A_ = choose_action(S, q_table)
                q_target = R + GAMMA * q_table.loc[S_, A_]
                #print(q_target)
            else:
                q_target = R
                is_terminated = True
            #print( q_target, q_predict, R)
            q_table.loc[S, A] += ALPHA * (q_target - q_predict)
            S = S_
            step_counter += 1
            update_env(S, episode, step_counter)
    return q_table    

In [28]:
q_table = rlQlearn()

'0: steps'
[['P', 'F', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'F', 'F', 'T']]
'1: steps'
[['F', 'F', 'F', 'W'],
 ['P', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'F', 'F', 'T']]
'2: steps'
[['P', 'F', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'F', 'F', 'T']]
'3: steps'
[['P', 'F', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'F', 'F', 'T']]
'4: steps'
[['F', 'P', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'F', 'F', 'T']]
'5: steps'
[['F', 'P', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'F', 'F', 'T']]
'6: steps'
[['P', 'F', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'F', 'F', 'T']]
'7: steps'
[['F', 'P', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'F', 'F', 'T']]
'8: steps'
[['F', 'P', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'F', 'F', 'T']]
'9: steps'
[['F', 'P', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W',

will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.


'16: steps'
[['F', 'F', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'P', 'F', 'T']]
'17: steps'
[['F', 'F', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'F', 'P', 'T']]
'18: steps'
[['F', 'F', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'F', 'F', 'P']]
                                           '0: steps'
[['P', 'F', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'F', 'F', 'T']]
'1: steps'
[['P', 'F', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'F', 'F', 'T']]
'2: steps'
[['F', 'P', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'F', 'F', 'T']]
'3: steps'
[['F', 'P', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'F', 'F', 'T']]
'4: steps'
[['P', 'F', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'F', 'F', 'T']]
'5: steps'
[['P', 'F', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'F', 'F', 'T']]
'6: steps'
[['P', 'F', 'F',

In [29]:
q_table

Unnamed: 0,u,d,l,r
0,-0.6861894,0.00106,-0.7941089,4.304672e-10
1,-0.8499054,-0.794109,9.087641e-09,3.874205e-11
2,-0.6513216,0.0,8.178877e-10,-0.8499054
3,0.0,0.0,0.0,0.0
4,4.782969e-09,0.004956,-0.7940923,-0.7940082
5,0.0,0.0,0.0,0.0
6,0.0,-0.833228,-0.7941089,0.0
7,-0.8499054,-0.71757,0.0,-0.814698
8,0.000194377,4.6e-05,-0.686164,0.01878593
9,-0.2709934,0.061444,0.0001900732,-0.271


In [31]:
q_table['location'] = STATE_index

In [35]:
for i, row in enumerate(q_table[['u','d','l','r']].idxmax(axis=1)):
    print( STATE_index[i], row )

[0, 0] d
[0, 1] l
[0, 2] l
[0, 3] u
[1, 0] d
[1, 1] u
[1, 2] u
[1, 3] l
[2, 0] r
[2, 1] d
[2, 2] u
[2, 3] u
[3, 0] r
[3, 1] r
[3, 2] r
[3, 3] d


In [15]:
MAZE_LIST

[['F', 'F', 'F', 'W'],
 ['F', 'W', 'F', 'F'],
 ['F', 'F', 'W', 'W'],
 ['F', 'F', 'F', 'T']]

In [34]:
q_table

Unnamed: 0,u,d,l,r,location
0,-0.6861894,0.00106,-0.7941089,4.304672e-10,"[0, 0]"
1,-0.8499054,-0.794109,9.087641e-09,3.874205e-11,"[0, 1]"
2,-0.6513216,0.0,8.178877e-10,-0.8499054,"[0, 2]"
3,0.0,0.0,0.0,0.0,"[0, 3]"
4,4.782969e-09,0.004956,-0.7940923,-0.7940082,"[1, 0]"
5,0.0,0.0,0.0,0.0,"[1, 1]"
6,0.0,-0.833228,-0.7941089,0.0,"[1, 2]"
7,-0.8499054,-0.71757,0.0,-0.814698,"[1, 3]"
8,0.000194377,4.6e-05,-0.686164,0.01878593,"[2, 0]"
9,-0.2709934,0.061444,0.0001900732,-0.271,"[2, 1]"
