In [9]:
import numpy as np
import random
import numpy as np

def print_pretty_matrix(title, matrix):
    """Print a 2-D NumPy array as a pipe-delimited table under a title.

    The output starts with two blank lines, the title and a row of eight
    asterisks. Each matrix row is then printed on its own line, with every
    element right-aligned in a 10-character field and separated by " | ".

    Args:
        title: Text printed above the table.
        matrix: The array to display; its shape must unpack into exactly
            two dimensions.

    Raises:
        TypeError: If ``matrix`` is not a ``np.ndarray``.
        ValueError: If ``matrix.shape`` does not have exactly two entries
            (raised by the shape unpacking, before anything is printed).
    """
    if not isinstance(matrix, np.ndarray):
        raise TypeError("Input must be a NumPy array")

    # Unpack first so a non-2-D array fails before any output is produced.
    n_rows, n_cols = matrix.shape

    print(f"\n\n{title}\n********")
    for r in range(n_rows):
        cells = [f"{matrix[r, c]:>10}" for c in range(n_cols)]
        print("| " + " | ".join(cells) + " |")

In [14]:
# Grid World parameters
grid_size = 4
start, goal = (0, 0), (3, 3)
learning_rate, discount_factor, epsilon = 0.1, 0.9, 0.1
episodes = 2000

# Available actions; an action's position in this list is its index along
# the last axis of the Q-table.
actions = ["up", "down", "left", "right"]
action_to_index = dict(zip(actions, range(len(actions))))

# Q-table: one value per (row, column, action), initialised to zero
Q = np.zeros((grid_size, grid_size, len(actions)))

# Reward grid: every cell defaults to a small movement penalty
rewards = np.full((grid_size, grid_size), -0.1)
rewards[[1, 2], [3, 2]] = 1    # positive rewards at cells (1, 3) and (2, 2)
rewards[[1, 3], [1, 0]] = -1   # negative rewards at cells (1, 1) and (3, 0)
rewards[goal] = 100            # reward for reaching the goal

# Show the initial Q-values as one grid per action (the last axis of Q),
# followed by the reward grid. Each slice is indexed by the loop variable so
# that the grid printed under "Q(i)" really is the one for action i.
for i in range(Q.shape[2]):
    print_pretty_matrix(f"Q({i})", Q[:, :, i])
print_pretty_matrix("rewards", rewards)



Q(0)
********
|        0.0 |        0.0 |        0.0 |        0.0 |
|        0.0 |        0.0 |        0.0 |        0.0 |
|        0.0 |        0.0 |        0.0 |        0.0 |
|        0.0 |        0.0 |        0.0 |        0.0 |


Q(1)
********
|        0.0 |        0.0 |        0.0 |        0.0 |
|        0.0 |        0.0 |        0.0 |        0.0 |
|        0.0 |        0.0 |        0.0 |        0.0 |
|        0.0 |        0.0 |        0.0 |        0.0 |


Q(2)
********
|        0.0 |        0.0 |        0.0 |        0.0 |
|        0.0 |        0.0 |        0.0 |        0.0 |
|        0.0 |        0.0 |        0.0 |        0.0 |
|        0.0 |        0.0 |        0.0 |        0.0 |


Q(3)
********
|        0.0 |        0.0 |        0.0 |        0.0 |
|        0.0 |        0.0 |        0.0 |        0.0 |
|        0.0 |        0.0 |        0.0 |        0.0 |
|        0.0 |        0.0 |        0.0 |        0.0 |


rewards
********
|       -0.1 |       -0.1 |       -0.1 |       -0.1 |

In [15]:
def choose_action(state):
    """Pick the next action for ``state`` with an epsilon-greedy rule.

    A uniform draw from [0, 1] is compared with the module-level ``epsilon``.
    If the draw is smaller, a random entry of ``actions`` is returned.
    Otherwise the action with the largest Q-value for this cell is returned
    (``np.argmax`` returns the first index when several values tie).

    Args:
        state: Indexable (row, column) position on the grid.

    Returns:
        One of the strings in ``actions``.
    """
    explore = random.uniform(0, 1) < epsilon
    if not explore:
        best_index = np.argmax(Q[state[0], state[1]])
        return actions[best_index]
    return random.choice(actions)

def take_action(state, action):
    """Return the grid position reached by applying ``action`` at ``state``.

    Moves are clamped to the grid: a move that would leave the
    ``grid_size`` x ``grid_size`` board returns the current position.

    Args:
        state: Indexable (row, column) position.
        action: One of "up", "down", "left" or "right".

    Returns:
        The resulting (row, column) tuple.

    Raises:
        ValueError: If ``action`` is not one of the four known moves. Without
            this check the function would return ``None``, and indexing a
            NumPy array with ``None`` is valid, so the mistake would go
            unnoticed instead of failing.
    """
    row, col = state[0], state[1]
    last = grid_size - 1  # highest valid row/column index

    if action == "up":
        return (max(row - 1, 0), col)
    if action == "down":
        return (min(row + 1, last), col)
    if action == "left":
        return (row, max(col - 1, 0))
    if action == "right":
        return (row, min(col + 1, last))
    raise ValueError(f"Unknown action: {action!r}")

# Q-learning training: each episode starts at `start` and runs until the
# agent's position equals `goal`.
for episode in range(episodes):
    state = start

    while state != goal:
        # Choose a move and work out which cell it leads to
        action = choose_action(state)
        next_state = take_action(state, action)

        # The reward is read from the cell being entered
        reward = rewards[next_state]

        # Update the Q-table: move the stored value for (state, action)
        # towards reward + discounted best value of the next cell.
        q_slot = (state[0], state[1], action_to_index[action])
        old_value = Q[q_slot]
        next_max = Q[next_state[0], next_state[1], :].max()
        td_target = reward + discount_factor * next_max
        Q[q_slot] = old_value + learning_rate * (td_target - old_value)

        # Advance to the next state
        state = next_state

# Show the final Q-table: one line per grid cell, listing the value of each
# action in that cell. np.ndindex visits the cells in row-major order.
for i, j in np.ndindex(grid_size, grid_size):
    print(f"Q({i},{j}): {Q[i, j, :]}")

# Show the reward matrix for reference
print("\nMatriz de Recompensas:")
print(rewards)


Q(0,0): [53.14790797 47.99136465 53.01702197 59.44139   ]
Q(0,1): [59.35462905 64.00460119 53.22759133 66.1571    ]
Q(0,2): [65.96036809 73.619      59.04152736 71.02100539]
Q(0,3): [-1.00000000e-02  8.14569914e+01  0.00000000e+00  1.60861756e+01]
Q(1,0): [-4.15750249e-02  6.14619011e+01  1.35945340e+01  1.74205995e+01]
Q(1,1): [11.23676458 24.98592398 10.53432164 73.61556424]
Q(1,2): [65.965345   81.91       64.83141537 80.92382646]
Q(1,3): [ 3.82670904 89.8999912  30.40425298 27.59368375]
Q(2,0): [-1.99000000e-02 -2.65557183e-01 -1.99000000e-02  7.27325144e+01]
Q(2,1): [-0.1         1.46925203  7.88369675 81.90705241]
Q(2,2): [73.26805374 72.9608166  72.61068649 89.9       ]
Q(2,3): [ 81.67173688 100.          81.76840577  89.4279129 ]
Q(3,0): [ 6.53523043 -0.18455718 -0.1        -0.01      ]
Q(3,1): [-1.00000000e-02 -1.00000000e-02 -1.00900000e-01  2.74367081e+01]
Q(3,2): [81.82913325 13.35369281  3.31599007 10.        ]
Q(3,3): [0. 0. 0. 0.]

Matriz de Recompensas:
[[ -0.1  -0.1  -