In [1]:
# Imports 
import numpy as np
from tqdm import trange

In [7]:
# Constantes 
GOAL_SCORE = 100
ALPHA = 0.1
GAMMA = 0.95
EPSILON = 0.8 
EPOCHS = 1000

# Variables
grid_rows = 3
grid_cols = 4

q_table = np.zeros((grid_rows, grid_cols, 4)) 

actions = ['up', 'down', 'left', 'right']

rewards = np.full((grid_rows, grid_cols), -1)
rewards[0, 3] = GOAL_SCORE # meta
obstacles = [[1,1]] # obstaculo

print(rewards)

[[ -1  -1  -1 100]
 [ -1  -1  -1  -1]
 [ -1  -1  -1  -1]]


In [5]:
# Funciones 
def is_terminal_state(row, col):
    if rewards[row, col] == GOAL_SCORE:
        return True
    return False

def get_random_location():
    row = np.random.randint(grid_rows)
    col = np.random.randint(grid_cols)
    while is_terminal_state(row, col) or [row, col] in obstacles:
        row = np.random.randint(grid_rows)
        col = np.random.randint(grid_cols)
    return row, col

def get_next_action(row, col):
    if np.random.random() < EPSILON:
        return np.argmax(q_table[row, col])
    return np.random.randint(4)

def get_next_state(row, col):
    finded = False
    while not finded:
        action_index = get_next_action(row, col)
        action = actions[action_index]
        new_row, new_col = row, col
        if action == 'up' and row > 0:
            new_row -= 1
        elif action == 'down' and row < grid_rows - 1:
            new_row += 1
        elif action == 'left' and col > 0:
            new_col -= 1
        elif action == 'right' and col < grid_cols - 1:
            new_col += 1

        if [new_row, new_col] not in obstacles and (new_row != row or new_col != col):
            finded = True
    return new_row, new_col, action_index

def get_shortest_path(start_row, start_col):
    if is_terminal_state(start_row, start_col):
        return []
    else:
        current_row, current_col = start_row, start_col
        path = [[current_row, current_col]]
        while not is_terminal_state(current_row, current_col):
            current_row, current_col, _ = get_next_state(current_row, current_col)
            path.append([current_row, current_col])
        return path

In [None]:
# Ejecucion

def train():
    for epoch in trange(EPOCHS):
        next_row, next_col = 2, 0
        while not is_terminal_state(next_row, next_col):
            old_row, old_col = next_row, next_col

            next_row, next_col, action_index = get_next_state(next_row, next_col)

            next_reward = rewards[next_row, next_col]
            old_q_value = q_table[old_row, old_col, action_index]

            temporal_difference = next_reward + (GAMMA * np.max(q_table[next_row, next_col])) - old_q_value

            q_table[old_row, old_col, action_index] = old_q_value + (ALPHA * temporal_difference)

train()
        

100%|██████████| 1000/1000 [00:00<00:00, 12799.45it/s]


In [10]:
# avr si jala esta vaina 
display(q_table)
get_shortest_path(2,0)

array([[[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]]])

[[2, 0], [1, 0], [0, 0], [0, 1], [0, 2], [0, 3]]