In [1]:
import numpy as np

class GridWorldQLearning:
    """Tabular Q-learning on a square grid world.

    Cells are addressed either as ``(row, col)`` positions or as integer
    states numbered row-major (``state = row * size + col``). Every move
    costs -1, moving onto an obstacle cell costs -5 (obstacles are
    penalised, not blocked), and moving onto the goal yields +10 and ends
    the episode.
    """

    def __init__(self, size=4, obstacles=None, start=(0, 0), goal=(3, 3)):
        """Set up the grid.

        Args:
            size: Side length of the square grid.
            obstacles: Iterable of ``(row, col)`` penalty cells. ``None``
                selects the default ``[(1, 1), (2, 2)]``; an empty iterable
                means "no obstacles".
            start: ``(row, col)`` cell where each episode begins.
            goal: ``(row, col)`` terminal cell.
        """
        self.size = size
        self.n_states = size * size
        self.n_actions = 4  # up, down, left, right

        # Positions are normalised to tuples because ``step`` compares them
        # against tuple positions; a list such as [3, 3] would never match.
        self.start_state = tuple(start)
        self.goal_state = tuple(goal)

        # Test against None (not truthiness) so that an explicit empty
        # collection really produces an obstacle-free grid.
        if obstacles is None:
            self.obstacles = [(1, 1), (2, 2)]
        else:
            self.obstacles = [tuple(cell) for cell in obstacles]

        # Action index -> (row delta, col delta)
        self.actions = {
            0: (-1, 0),  # up
            1: (1, 0),   # down
            2: (0, -1),  # left
            3: (0, 1),   # right
        }

    def _pos_to_state(self, pos):
        """Convert a ``(row, col)`` position to its row-major state index."""
        return pos[0] * self.size + pos[1]

    def _state_to_pos(self, state):
        """Convert a row-major state index back to ``(row, col)``."""
        return (state // self.size, state % self.size)

    def _is_valid_position(self, pos):
        """Return True when ``pos`` lies inside the grid."""
        return 0 <= pos[0] < self.size and 0 <= pos[1] < self.size

    def step(self, state, action):
        """Apply ``action`` in ``state``.

        A move that would leave the grid keeps the agent in place (and still
        costs -1).

        Returns:
            Tuple ``(next_state, reward, done)``.
        """
        row, col = self._state_to_pos(state)
        d_row, d_col = self.actions[action]
        new_pos = (row + d_row, col + d_col)

        if not self._is_valid_position(new_pos):
            new_pos = (row, col)  # bumped into the wall: stay put

        done = new_pos == self.goal_state
        if done:
            reward = 10  # the goal takes precedence over an obstacle penalty
        elif new_pos in self.obstacles:
            reward = -5
        else:
            reward = -1  # regular step cost

        return self._pos_to_state(new_pos), reward, done

    def q_learning(self, episodes=500, alpha=0.5, gamma=0.9, epsilon=0.1):
        """Learn a Q-table with epsilon-greedy Q-learning.

        Args:
            episodes: Number of episodes, each run from the start cell until
                the goal is reached.
            alpha: Learning rate.
            gamma: Discount factor.
            epsilon: Probability of picking a uniformly random action.

        Returns:
            Tuple ``(Q, policy)``: ``Q`` has shape ``(n_states, n_actions)``
            and ``policy`` holds the greedy action index for every state.
        """
        Q = np.zeros((self.n_states, self.n_actions))
        initial_state = self._pos_to_state(self.start_state)

        for _ in range(episodes):
            state = initial_state
            done = False

            while not done:
                # epsilon-greedy action selection
                if np.random.rand() < epsilon:
                    action = np.random.randint(self.n_actions)
                else:
                    action = np.argmax(Q[state])

                next_state, reward, done = self.step(state, action)

                # A terminal transition has no future return, so do not
                # bootstrap from the goal state's Q-values.
                future = 0.0 if done else gamma * np.max(Q[next_state])
                Q[state, action] += alpha * (reward + future - Q[state, action])

                state = next_state

        # Greedy policy extracted from the learned table
        policy = np.argmax(Q, axis=1)
        return Q, policy

    def print_policy_matrix(self, policy):
        """Print the greedy action of every cell as an arrow grid.

        Every cell is printed, including the goal and obstacle cells.
        """
        arrow_symbols = ['^', 'v', '<', '>']
        for i in range(self.size):
            print("".join(
                arrow_symbols[policy[self._pos_to_state((i, j))]] + " "
                for j in range(self.size)
            ))
        print("-" * 20)

    def print_q_max_matrix(self, Q):
        """Print the maximum Q(s, a) of every cell as a matrix."""
        Q_max = np.max(Q, axis=1).reshape(self.size, self.size)
        for i in range(self.size):
            print("".join(f"{Q_max[i, j]:5.2f} " for j in range(self.size)))
        print("-" * 20)

# Train an agent on the default grid, then report the greedy policy and the
# best Q-value of every cell.
grid = GridWorldQLearning()
hyperparams = {"episodes": 1000, "alpha": 0.5, "gamma": 0.9, "epsilon": 0.1}
Q, policy = grid.q_learning(**hyperparams)

for heading, show, table in (
    ("Política aprendida por Q-Learning:", grid.print_policy_matrix, policy),
    ("Valores Q máximos por casilla:", grid.print_q_max_matrix, Q),
):
    print(heading)
    show(table)

Política aprendida por Q-Learning:
> > v v 
v > > v 
v v > v 
> > > ^ 
--------------------
Valores Q máximos por casilla:
 1.81  3.12  4.58  6.20 
 3.12  4.57  6.20  8.00 
 4.58  6.08  8.00 10.00 
 6.20  8.00 10.00  0.00 
--------------------
