In [14]:
import numpy as np
import pandas as pd
import gym

In [18]:
# Custom grid-world environment
class CustomEnv(gym.Env):
    """Deterministic 5x5 grid world with a single goal cell.

    States are ``(row, col)`` tuples. The agent starts at ``(0, 0)`` and an
    episode is finished as soon as it stands on ``(4, 4)``. Actions are
    encoded as 0 = up, 1 = down, 2 = left, 3 = right; a move that would
    leave the grid keeps the agent on the border cell instead.
    """

    def __init__(self):
        self.grid_size = 5  # side length of the square grid
        self.start_state = (0, 0)  # cell the agent is placed on by reset()
        self.goal_state = (4, 4)  # reaching this cell ends the episode
        self.current_state = self.start_state  # agent position
        # Four discrete actions: up, down, left, right
        self.action_space = gym.spaces.Discrete(4)
        # An observation is one discrete coordinate per grid axis
        self.observation_space = gym.spaces.Tuple(
            tuple(gym.spaces.Discrete(self.grid_size) for _ in range(2))
        )

    def reset(self):
        """Put the agent back on the start cell and return that state."""
        start = self.start_state
        self.current_state = start
        return start

    def step(self, action):
        """Apply one move and return ``(state, reward, done, info)``.

        The reward is 1 on the step that reaches the goal and 0 otherwise.
        An action code outside 0-3 leaves the position unchanged.
        """
        row, col = self.current_state
        last_index = self.grid_size - 1

        # Each branch clamps only the coordinate it moves.
        if action == 0:  # up
            moved_to = (max(0, row - 1), col)
        elif action == 1:  # down
            moved_to = (min(last_index, row + 1), col)
        elif action == 2:  # left
            moved_to = (row, max(0, col - 1))
        elif action == 3:  # right
            moved_to = (row, min(last_index, col + 1))
        else:
            moved_to = (row, col)

        self.current_state = moved_to
        reached_goal = (moved_to == self.goal_state)

        return moved_to, int(reached_goal), reached_goal, {}

    def render(self):
        """Print the grid as text: 0 = empty, 1 = agent, 2 = goal."""
        side = self.grid_size
        board = np.zeros((side, side), dtype=np.int8)

        goal = self.goal_state
        board[goal[0], goal[1]] = 2
        # Drawn after the goal so the agent marker wins when both coincide.
        agent = self.current_state
        board[agent[0], agent[1]] = 1

        for cells in board:
            print(*cells)


# Q-learning algorithm
def q_learning(env, num_episodes, alpha, gamma, epsilon):
    """Learn a tabular action-value function with epsilon-greedy Q-learning.

    Args:
        env: Environment exposing ``grid_size``, ``action_space.n``,
            ``action_space.sample()``, ``reset()`` and a ``step(action)``
            that returns ``(next_state, reward, done, info)``. States are
            used directly as indices into the table.
        num_episodes: Number of episodes to run.
        alpha: Learning rate applied to each temporal-difference error.
        gamma: Discount factor for the best next-state value.
        epsilon: Probability of taking a random action instead of the
            greedy one.

    Returns:
        Array of shape ``(grid_size, grid_size, action_space.n)`` holding
        the learned Q-values.
    """
    table_shape = (env.grid_size, env.grid_size, env.action_space.n)
    q_table = np.zeros(table_shape)

    for _ in range(num_episodes):
        state = env.reset()

        while True:
            # Epsilon-greedy choice: explore with probability epsilon.
            explore = np.random.rand() < epsilon
            action = env.action_space.sample() if explore else np.argmax(q_table[state])

            next_state, reward, done, _ = env.step(action)

            # View into the table, so the in-place update writes through.
            action_values = q_table[state]
            td_target = reward + gamma * np.max(q_table[next_state])
            action_values[action] += alpha * (td_target - action_values[action])

            state = next_state
            if done:
                break

    return q_table


# Main entry point
def main():
    """Train a Q-table on ``CustomEnv`` and replay the greedy policy.

    Prints the learned Q-table, then renders the grid after every greedy
    step starting from the environment's reset state. The replay is capped
    so that a policy which never reaches the goal cannot hang the cell.
    """
    env = CustomEnv()

    # Q-learning hyperparameters
    num_episodes = 1000
    alpha = 0.5
    gamma = 0.9
    epsilon = 0.1

    # Run Q-learning
    q_table = q_learning(env, num_episodes, alpha, gamma, epsilon)

    # Print the final Q-table
    print("Tabela Q final:")
    print(q_table)

    # Replay the learned (greedy) policy
    state = env.reset()
    done = False

    # The environment and the greedy policy are both deterministic, so a
    # successful path never revisits a cell and needs fewer steps than there
    # are cells. Exceeding that bound means the policy is stuck in a cycle
    # (e.g. np.argmax picking action 0 on an all-zero row and bumping into
    # the top wall forever).
    max_steps = env.grid_size ** 2
    steps_taken = 0

    print("Caminho encontrado:")
    env.render()

    while not done and steps_taken < max_steps:
        action = np.argmax(q_table[state])
        state, _, done, _ = env.step(action)
        env.render()
        steps_taken += 1

    if done:
        print("Chegou ao objetivo!")
    else:
        print("Objetivo não alcançado após", max_steps, "passos.")


# Run the demo when this code is executed directly rather than imported as a module.
if __name__ == "__main__":
    main()


Tabela Q final:
[[[0.43046721 0.38742049 0.43046721 0.4782969 ]
  [0.4782969  0.43046679 0.43046711 0.531441  ]
  [0.531441   0.47829686 0.4782969  0.59049   ]
  [0.59049    0.6561     0.53144087 0.531441  ]
  [0.26565557 0.         0.59049    0.39857974]]

 [[0.43046721 0.         0.         0.21523319]
  [0.47829687 0.         0.33889831 0.        ]
  [0.531441   0.24198014 0.21438054 0.492075  ]
  [0.59049    0.71562669 0.47829504 0.729     ]
  [0.531441   0.81       0.6561     0.72899999]]

 [[0.         0.         0.         0.        ]
  [0.         0.         0.         0.        ]
  [0.41839296 0.         0.         0.36165234]
  [0.6561     0.45605357 0.16132006 0.80990112]
  [0.729      0.9        0.6141825  0.81      ]]

 [[0.         0.         0.         0.        ]
  [0.         0.         0.         0.        ]
  [0.         0.         0.         0.        ]
  [0.7235147  0.09675943 0.         0.675     ]
  [0.81       1.         0.63183103 0.89999999]]

 [[0.         0.