In [1]:
import numpy as np
import random
import numpy as np

def print_pretty_matrix(title, matrix):
    if not isinstance(matrix, np.ndarray):
        raise TypeError("Input must be a NumPy array")
    
    rows, cols = matrix.shape
    print(f"\n\n{title}\n********")
    for row in range(rows):
        formatted_row = " | ".join(f"{matrix[row, col]:>10}" for col in range(cols))
        print(f"| {formatted_row} |")

In [19]:
# Parámetros del Grid World
grid_size = 4
start = (0, 0)
goal = (3, 3)
learning_rate = 0.0005
discount_factor = 0.9
epsilon = 0.1
episodes = 10000
# Definir las acciones
actions = ["up", "down", "left", "right"]
action_to_index = {action: i for i, action in enumerate(actions)}

# Definir la matriz de recompensas del Grid World
rewards = np.full((grid_size, grid_size), -0.1)  # Penalización por movimiento
rewards[1, 3] = 1  # Recompensa positiva
rewards[2, 2] = 1  # Recompensa positiva
rewards[1, 1] = -1  # Recompensa negativa
rewards[3, 0] = -1  # Recompensa negativa
rewards[goal] = 100  # Recompensa por llegar a la meta

print_pretty_matrix("rewards", rewards)



rewards
********
|       -0.1 |       -0.1 |       -0.1 |       -0.1 |
|       -0.1 |       -1.0 |       -0.1 |        1.0 |
|       -0.1 |       -0.1 |        1.0 |       -0.1 |
|       -1.0 |       -0.1 |       -0.1 |      100.0 |


In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Definir la red neuronal para aproximar la función Q
class DQN(nn.Module):
    def __init__(self):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(grid_size * grid_size, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 4)  # 4 acciones posibles: arriba, abajo, izquierda, derecha

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

# Inicializar la red DQN
dqn = DQN()
optimizer = optim.Adam(dqn.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()

# Función para convertir el estado (posición en la cuadrícula) en un tensor de entrada para la red
def state_to_tensor(state):
    tensor = torch.zeros(grid_size * grid_size)
    tensor[state[0] * grid_size + state[1]] = 1.0
    return tensor.unsqueeze(0).float()

# Función para elegir la acción (con epsilon-greedy)
def choose_action(state):
    if random.uniform(0, 1) < epsilon:
        return random.randint(0, 3)  # Acción aleatoria
    else:
        with torch.no_grad():
            q_values = dqn(state_to_tensor(state))
        return torch.argmax(q_values).item()

# Función para obtener la próxima posición dada una acción
def take_action(state, action):
    if action == 0:  # up
        return (max(state[0] - 1, 0), state[1])
    elif action == 1:  # down
        return (min(state[0] + 1, grid_size - 1), state[1])
    elif action == 2:  # left
        return (state[0], max(state[1] - 1, 0))
    elif action == 3:  # right
        return (state[0], min(state[1] + 1, grid_size - 1))

# Entrenamiento con DQN
for episode in range(episodes):
    state = start

    if episode % (episodes // 10) == 0:
        print(episode)

    while state != goal:
        action = choose_action(state)
        next_state = take_action(state, action)

        # Obtener la recompensa correspondiente
        reward = rewards[next_state]

        # Calcular el valor Q objetivo
        with torch.no_grad():
            next_q_values = dqn(state_to_tensor(next_state))
            max_next_q_value = torch.max(next_q_values).item()

        target_q_value = reward + discount_factor * max_next_q_value

        # Obtener el valor Q estimado actual
        q_values = dqn(state_to_tensor(state))
        current_q_value = q_values[0, action]

        # Convertir el objetivo a float32 para que coincida con el tipo de datos del tensor de PyTorch
        target_q_value = torch.tensor(target_q_value).float()

        # Calcular la pérdida
        loss = loss_fn(current_q_value, torch.tensor(target_q_value))

        # Actualizar la red
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Mover al siguiente estado
        state = next_state

# Mostrar la matriz de recompensas para referencia
print("\nMatriz de Recompensas:")
print(rewards)

0


  loss = loss_fn(current_q_value, torch.tensor(target_q_value))


1000
2000
3000
4000
5000
6000
7000
8000
9000

Matriz de Recompensas:
[[ -0.1  -0.1  -0.1  -0.1]
 [ -0.1  -1.   -0.1   1. ]
 [ -0.1  -0.1   1.   -0.1]
 [ -1.   -0.1  -0.1 100. ]]


In [22]:
loss.item()

0.029872387647628784

In [23]:
for i in range(4):
    for j in range(4):
        print(f"Q({i},{j}): {dqn(state_to_tensor((i, j)))}")

Q(0,0): tensor([[134.2445, 147.0143, 132.8422, 148.6153]], grad_fn=<AddmmBackward0>)
Q(0,1): tensor([[150.1373, 164.6835, 134.6351, 165.0654]], grad_fn=<AddmmBackward0>)
Q(0,2): tensor([[166.9176, 183.9651, 149.3241, 183.0212]], grad_fn=<AddmmBackward0>)
Q(0,3): tensor([[174.6132, 204.0490, 167.4269, 183.5471]], grad_fn=<AddmmBackward0>)
Q(1,0): tensor([[134.1128, 162.7217, 144.8471, 164.0856]], grad_fn=<AddmmBackward0>)
Q(1,1): tensor([[149.7819, 183.1437, 149.9641, 185.1521]], grad_fn=<AddmmBackward0>)
Q(1,2): tensor([[166.7669, 204.9561, 167.0045, 205.0407]], grad_fn=<AddmmBackward0>)
Q(1,3): tensor([[184.9557, 226.7475, 185.6731, 204.0625]], grad_fn=<AddmmBackward0>)
Q(2,0): tensor([[152.3635, 179.8279, 160.6451, 185.0702]], grad_fn=<AddmmBackward0>)
Q(2,1): tensor([[165.8397, 200.1668, 164.0280, 205.9291]], grad_fn=<AddmmBackward0>)
Q(2,2): tensor([[184.3490, 224.4703, 184.1567, 226.7924]], grad_fn=<AddmmBackward0>)
Q(2,3): tensor([[205.8872, 251.8546, 205.4970, 226.7765]], grad_f