# Q-Learning para o mundo de grades

### Preparando a estrutura do mundo das grades

In [1]:
from tkinter import *

# --- Grid geometry ---------------------------------------------------------
size = 100       # side length of one grid cell, in pixels
max_x = 3        # number of cells along the x axis
max_y = 2        # number of cells along the y axis
r = 30           # radius of the player's circle, in pixels

# --- Episode state ---------------------------------------------------------
player = (0, 1)  # (x, y) cell currently occupied by the player
final = (2,0)    # (x, y) goal cell, painted green by create_board()
restart = False  # set to True by move() when the goal cell is reached

# --- Tk handles, filled in by create_board() --------------------------------
master = None    # Tk root window
board = None     # Canvas on which the grid is drawn
me = None        # canvas item id of the player's circle
arrow_ids = []   # canvas item ids of the arrows added by draw_arrows()

def create_board():
    """Create the Tk window and draw the grid, the goal cell and the player.

    Populates the module-level ``master`` (Tk root), ``board`` (Canvas) and
    ``me`` (canvas item id of the player's circle). The canvas is created
    but not placed in the window here; ``begin`` does that.
    """
    global board, me, master
    master = Tk()
    # Derive the canvas size from the grid dimensions so it always matches
    # the cells drawn below, instead of a hard-coded 3x2 layout.
    board = Canvas(master, width=max_x*size, height=max_y*size)

    # One white square per cell; the x index selects the column and the
    # y index selects the row.
    for col in range(max_x):
        for row in range(max_y):
            board.create_rectangle(col*size, row*size,
                                   (col+1)*size, (row+1)*size,
                                   fill="white", width=1)

    # Goal cell, painted over the white square.
    board.create_rectangle(final[0]*size, final[1]*size,
                           (final[0]+1)*size, (final[1]+1)*size,
                           fill="green", width=1)

    # Bounding box of a circle of radius r centred in the player's cell.
    center_x = player[0]*size + size / 2
    center_y = player[1]*size + size / 2
    me = board.create_oval(center_x - r, center_y - r,
                           center_x + r, center_y + r,
                           fill="red", width=1)

def move(dx, dy):
    """Try to move the player by ``(dx, dy)`` cells and return the reward.

    If the target cell lies inside the grid, the player's circle is moved
    on the canvas and ``player`` is updated; otherwise the player stays put.

    Returns:
        1 if the target cell is the goal (``restart`` is also set to True),
        -1 if the target cell is outside the grid, 0 otherwise.
    """
    global player, me, restart, board
    target = (player[0] + dx, player[1] + dy)
    inside_grid = 0 <= target[0] < max_x and 0 <= target[1] < max_y

    if inside_grid:
        # Bounding box of the circle centred in the target cell.
        left = target[0]*size + size / 2 - r
        top = target[1]*size + size / 2 - r
        right = target[0]*size + size / 2 + r
        bottom = target[1]*size + size / 2 + r

        board.coords(me, left, top, right, bottom)
        player = target
        reward = 0
    else:
        reward = -1

    if target == final:
        reward = 1
        restart = True

    return reward

def move_up(event=None):
    """Move the player one cell up (decreasing y); ``event`` is ignored.

    Returns the reward reported by ``move``.
    """
    return move(dx=0, dy=-1)

def move_down(event=None):
    """Move the player one cell down (increasing y); ``event`` is ignored.

    Returns the reward reported by ``move``.
    """
    return move(dx=0, dy=1)

def move_left(event=None):
    """Move the player one cell left (decreasing x); ``event`` is ignored.

    Returns the reward reported by ``move``.
    """
    return move(dx=-1, dy=0)

def move_right(event=None):
    """Move the player one cell right (increasing x); ``event`` is ignored.

    Returns the reward reported by ``move``.
    """
    return move(dx=1, dy=0)

def begin():
    """Build the board, place the canvas in the window and run the Tk loop."""
    global master, board
    create_board()
    # Put the canvas in the top-left grid slot of the root window.
    board.grid(column=0, row=0)
    master.mainloop()

### Q-Learning

**Parâmetros**

In [2]:
# Learning hyper-parameters read by update_Q().
alpha = 1    # learning rate
gamma = 0.9  # discount factor

# Q-table: maps a state (x, y) to a list holding one value per action.
# It is filled by initialize_Q().
Q = {}

# Action index -> movement function. The indices match the arrow table in
# draw_arrows(): 0 up, 1 down, 2 left, 3 right.
actions = [move_up, move_down, move_left, move_right]

score = 1    # not read by any code visible in this file
t = 1        # rebound to the worker thread at the bottom of the file

from enum import Enum

class Return(Enum):
    """Selects what ``Q_optimum`` hands back for a state."""

    # Index of the best action in the state.
    ACTION = 0
    # Value of the best action in the state.
    VALUE = 1

**Função Q\***

In [3]:
def Q_optimum(state, return_type):
    """Return the best action of ``state``, or its value, from the Q-table.

    Args:
        state: key into ``Q``.
        return_type: ``Return.ACTION`` for the index of the best action,
            ``Return.VALUE`` for its value.

    Returns:
        The index or value of the highest-valued action; on ties the lowest
        index wins. Both are None when the state has no action values.

    Raises:
        KeyError: if ``state`` is not in ``Q``.
        ValueError: if ``return_type`` is neither member of ``Return``.
    """
    values = Q[state]

    # max() keeps the first maximal element, so ties resolve to the lowest
    # action index.
    best_action = max(range(len(values)), key=values.__getitem__, default=None)
    max_value = None if best_action is None else values[best_action]

    if return_type == Return.ACTION:
        return best_action
    if return_type == Return.VALUE:
        return max_value
    raise ValueError('Parameter return_type is not defined')

**Função para atualizar a matriz Q**

In [4]:
def update_Q(state, new_state, action, reward):
    """Apply one Q-learning update to ``Q[state][action]``.

    Implements
        Q(s, a) <- (1 - alpha) * Q(s, a)
                   + alpha * (reward + gamma * max_a' Q(s', a'))

    Args:
        state: state in which the action was taken.
        new_state: state reached after the action.
        action: index of the action taken in ``state``.
        reward: reward received for the transition.
    """
    # The old estimate that is blended in must be the value of the action
    # actually taken, not the best value available in ``state``. The two
    # coincide only when the action was greedy (or when alpha == 1, where
    # the old estimate is discarded entirely).
    old_value = Q[state][action]
    target = reward + gamma*Q_optimum(new_state, Return.VALUE)
    Q[state][action] = (1-alpha)*old_value + alpha*target

In [5]:
def execute(action):
    """Run the action with index ``action`` and report the outcome.

    Returns:
        A ``(new_state, reward)`` tuple: the player's position after the
        move and the reward returned by the movement function.
    """
    # Execute the requested action; the movement function returns the reward.
    reward = actions[action]()
    # ``player`` is read only after the move so it reflects the new position.
    return player, reward

In [6]:
def initialize_Q():
    """Give every grid cell a Q-table entry with each action valued at 0.1."""
    n_actions = len(actions)
    for cell_x in range(max_x):
        for cell_y in range(max_y):
            # A fresh list per cell, so entries can be updated independently.
            Q[cell_x, cell_y] = [0.1] * n_actions

In [7]:
def draw_arrows():
    """Redraw the greedy-policy arrows on the board and return the policy.

    For every state in ``Q`` the actions sharing the highest value are
    collected, and one arrow per such action is drawn in the state's cell
    (no arrows are drawn in the goal cell). The canvas ids of the arrows are
    stored in the module-level ``arrow_ids``.

    Returns:
        dict mapping each state to a dict ``{action_index: value}`` holding
        its best action(s).
    """
    global board, Q, arrow_ids

    # Arrow endpoints (x0, y0, x1, y1) relative to the cell's top-left
    # corner; the arrowhead is drawn at (x1, y1).
    arrows = {
        0: (50,70,50,30), # up
        1: (50,30,50,70), # down
        2: (70,50,30,50), # left
        3: (30,50,70,50)  # right
    }

    # Remove the arrows drawn by the previous call. Without this, arrows for
    # actions that are no longer optimal stay on screen and canvas items
    # pile up on every step.
    for arrow_id in arrow_ids:
        board.delete(arrow_id)
    arrow_ids.clear()

    policy = {}
    for state, action_values in Q.items():
        # All actions tied for the highest value, keyed by action index.
        top_value = max(action_values, default=None)
        best_actions = {
            index: value
            for index, value in enumerate(action_values)
            if value == top_value
        }
        policy[state] = best_actions

        # The goal cell ends the episode, so no arrows are drawn there.
        if state == final:
            continue

        for action in best_actions:
            x0, y0, x1, y1 = arrows[action]
            arrow_id = board.create_line(x0 + size * state[0],
                                         y0 + size * state[1],
                                         x1 + size * state[0],
                                         y1 + size * state[1],
                                         arrow=LAST)
            arrow_ids.append(arrow_id)
    return policy

In [8]:
import time 
import threading

def main():
    """Run the greedy Q-learning loop forever, restarting at each goal hit.

    Each iteration picks the best action for the current state, executes it,
    updates the Q-table and redraws the policy arrows. When ``restart`` has
    been set, the player is put back on cell (0, 1) and the arrows are
    removed from the board. This function never returns.
    """
    global player, Q, restart, r, me, board, arrow_ids
    # NOTE(review): presumably this delay lets begin() create the board
    # before the loop touches it -- confirm.
    time.sleep(10)
    initialize_Q()
    state = player
    while True:
        chosen = Q_optimum(state, Return.ACTION)
        next_state, reward = execute(chosen)
        update_Q(state, next_state, chosen, reward)
        state = next_state
        draw_arrows()

        if restart:
            time.sleep(.5)
            # Put the player back on the starting cell.
            player = (0,1)
            state = player
            restart = False
            # Remove every arrow currently on the board.
            for stale_id in arrow_ids:
                board.delete(stale_id)
            arrow_ids = []
            # Move the circle back to the centre of the starting cell.
            center_x = player[0]*size + size / 2
            center_y = player[1]*size + size / 2
            board.coords(me, center_x - r, center_y - r,
                         center_x + r, center_y + r)

        time.sleep(.2)
        
# Run the learning loop in a daemon thread, then start the GUI in this
# thread via begin().
t = threading.Thread(target=main, daemon=True)
t.start()
begin()