In [1]:
import numpy as np

class GridWorld:
    def __init__(self, size=2):
        self.size = size
        self.n_states = size * size
        self.n_actions = 4  # arriba, abajo, izquierda, derecha

        # Definir estados especiales
        self.start_state = (0, 0)
        self.goal_state = (1, 1)
        self.obstacles = [(0, 1)]

        # Mapeo de acciones a movimientos
        self.actions = {
            0: (-1, 0),  # arriba
            1: (1, 0),   # abajo
            2: (0, -1),  # izquierda
            3: (0, 1)    # derecha
        }

        # Crear matrices de transición y recompensa
        self.P = self._build_transition_matrix()
        self.R = self._build_reward_matrix()

    def _pos_to_state(self, pos):
        return pos[0] * self.size + pos[1]

    def _state_to_pos(self, state):
        return (state // self.size, state % self.size)

    def _is_valid_position(self, pos):
        return 0 <= pos[0] < self.size and 0 <= pos[1] < self.size

    def _build_transition_matrix(self):
        P = np.zeros((self.n_states, self.n_actions, self.n_states))
        for state in range(self.n_states):
            pos = self._state_to_pos(state)
            if pos == self.goal_state:
                for action in range(self.n_actions):
                    P[state][action][state] = 1.0
                continue
            for action in range(self.n_actions):
                move = self.actions[action]
                new_pos = (pos[0] + move[0], pos[1] + move[1])
                if self._is_valid_position(new_pos):
                    new_state = self._pos_to_state(new_pos)
                    P[state][action][new_state] = 1.0
                else:
                    P[state][action][state] = 1.0
        return P

    def _build_reward_matrix(self):
        R = np.full((self.n_states, self.n_actions, self.n_states), -1.0)
        for state in range(self.n_states):
            for action in range(self.n_actions):
                for next_state in range(self.n_states):
                    if self.P[state][action][next_state] > 0:
                        next_pos = self._state_to_pos(next_state)
                        if next_pos == self.goal_state:
                            R[state][action][next_state] = 10.0
                        elif next_pos in self.obstacles:
                            R[state][action][next_state] = -5.0
                        else:
                            R[state][action][next_state] = -1.0
        return R

    def value_iteration(self, gamma=0.9, theta=1e-6, max_iterations=1000):
        V = np.zeros(self.n_states)
        for iteration in range(max_iterations):
            V_old = V.copy()
            for s in range(self.n_states):
                action_values = []
                for a in range(self.n_actions):
                    value = 0
                    for s_prime in range(self.n_states):
                        prob = self.P[s][a][s_prime]
                        reward = self.R[s][a][s_prime]
                        value += prob * (reward + gamma * V_old[s_prime])
                    action_values.append(value)
                V[s] = max(action_values)
            if np.max(np.abs(V - V_old)) < theta:
                break
        return V

    def extract_policy(self, V, gamma=0.9):
        policy = np.zeros(self.n_states, dtype=int)
        for s in range(self.n_states):
            action_values = []
            for a in range(self.n_actions):
                value = 0
                for s_prime in range(self.n_states):
                    prob = self.P[s][a][s_prime]
                    reward = self.R[s][a][s_prime]
                    value += prob * (reward + gamma * V[s_prime])
                action_values.append(value)
            policy[s] = np.argmax(action_values)
        return policy

    def print_policy_matrix(self, policy):
        arrow_symbols = ['^', 'v', '<', '>']
        for i in range(self.size):
            row_str = ""
            for j in range(self.size):
                state = self._pos_to_state((i, j))
                row_str += arrow_symbols[policy[state]] + " "
            print(row_str)
        print("-" * 20)

    def print_value_matrix(self, V):
        V_grid = V.reshape(self.size, self.size)
        for i in range(self.size):
            row_str = ""
            for j in range(self.size):
                row_str += f"{V_grid[i,j]:6.2f} "
            print(row_str)
        print("-" * 20)


# === Uso ===
grid = GridWorld(size=2)  # cambia tamaño si quieres

V_optimal = grid.value_iteration(gamma=0.9)
policy_optimal = grid.extract_policy(V_optimal, gamma=0.9)

print("Matriz de política:")
grid.print_policy_matrix(policy_optimal)

print("Matriz de valores:")
grid.print_value_matrix(V_optimal)

Matriz de política:
v v 
> ^ 
--------------------
Matriz de valores:
 89.00 100.00 
100.00 100.00 
--------------------
