# Task 3
## Proceso de Decisión de Markov

In [237]:
import numpy as np

# Seed NumPy's global random generator so the random draws below are reproducible
np.random.seed(43)

### Entorno Frozen Lake

In [238]:
class FrozenLakeEnvironment:
    """4x4 Frozen Lake grid world with a random start corner and random holes.

    The grid is a NumPy array of single-character cell codes: 'S' (start),
    'F' (frozen), 'H' (hole) and 'G' (goal). The start corner is drawn with
    NumPy's global random generator and the goal is placed in the diagonally
    opposite corner. Holes are only added when ``setup_environment`` is called.

    Attributes:
        grid_size: Side length of the square grid (4).
        states: The list of cell codes, ['S', 'F', 'H', 'G'].
        grid: ``(grid_size, grid_size)`` array of cell codes.
        env, desc: Initialised to ``None``; not used inside this class.
        start_pos: ``(row, col)`` of the start cell.
        goal_pos: ``(row, col)`` of the goal cell.
    """

    def __init__(self):
        self.grid_size = 4
        self.states = ['S', 'F', 'H', 'G']  # Start, Frozen, Hole, Goal
        self.grid = np.full((self.grid_size, self.grid_size), 'F')  # every cell starts frozen
        self.env = None
        self.desc = None
        # Pick a random corner as the start; the goal goes in the opposite corner
        start_positions = [(0, 0), (0, self.grid_size-1), (self.grid_size-1, 0), (self.grid_size-1, self.grid_size-1)]
        start_pos = start_positions[np.random.choice(len(start_positions))]
        goal_pos = (self.grid_size-1-start_pos[0], self.grid_size-1-start_pos[1])
        self.grid[start_pos] = 'S'
        self.grid[goal_pos] = 'G'
        self.start_pos = start_pos
        self.goal_pos = goal_pos

    def _goal_reachable(self, extra_holes=()):
        """Return True if the goal can be reached from the start.

        Movement is 4-connected (up, down, left, right). Cells already marked
        'H' in the grid and the positions in ``extra_holes`` are impassable.
        """
        blocked = set(extra_holes)
        visited = {self.start_pos}
        frontier = [self.start_pos]  # plain list used as a stack (depth-first search)
        while frontier:
            row, col = frontier.pop()
            if (row, col) == self.goal_pos:
                return True
            for d_row, d_col in ((-1, 0), (1, 0), (0, -1), (0, 1)):
                neighbour = (row + d_row, col + d_col)
                inside = (0 <= neighbour[0] < self.grid_size
                          and 0 <= neighbour[1] < self.grid_size)
                if not inside or neighbour in visited or neighbour in blocked:
                    continue
                if self.grid[neighbour] == 'H':
                    continue
                visited.add(neighbour)
                frontier.append(neighbour)
        return False

    def _place_holes(self, max_attempts=1000):
        """Turn 1 to 3 random frozen cells into holes, keeping the goal reachable.

        A candidate layout is drawn with the global NumPy RNG and rejected if
        it would cut the start off from the goal; drawing is then repeated.
        The first draw consumes the RNG exactly like a single unconditional
        draw would, so layouts that are already solvable are unaffected.

        Args:
            max_attempts: Upper bound on the number of layouts tried.

        Raises:
            RuntimeError: If no solvable layout is found in ``max_attempts`` draws.
        """
        available_positions = [(i, j) for i in range(self.grid_size) for j in range(self.grid_size)
                               if self.grid[i, j] == 'F']
        if not available_positions:
            # Nothing left to convert (e.g. after repeated setup calls)
            return

        for _ in range(max_attempts):
            # Clamp so sampling without replacement never asks for more cells than exist
            num_holes = min(np.random.randint(1, 4), len(available_positions))
            hole_positions = np.random.choice(range(len(available_positions)), size=num_holes, replace=False)
            candidate_holes = [available_positions[pos_index] for pos_index in hole_positions]
            if self._goal_reachable(candidate_holes):
                for hole in candidate_holes:
                    self.grid[hole] = 'H'
                return

        raise RuntimeError(
            f"Could not place holes without blocking the goal after {max_attempts} attempts"
        )

    def setup_environment(self):
        """Finish building the map by placing the holes on the grid."""
        self._place_holes()

# Build the environment and scatter the holes over the grid
frozen_lake = FrozenLakeEnvironment()
frozen_lake.setup_environment()

# Show the heading and the resulting map, one per line
print("El frozen lake quedo de la siguiente manera:", frozen_lake.grid, sep="\n")

El frozen lake quedo de la siguiente manera:
[['S' 'F' 'F' 'F']
 ['F' 'H' 'F' 'F']
 ['F' 'F' 'F' 'F']
 ['F' 'F' 'F' 'G']]


## Resolver el Frozen Lake

In [239]:
# Value-iteration solver that returns the greedy policy for the grid world
def find_optimal_policy(env, gamma=0.99, theta=1e-8):
    """Solve the grid world held by ``env`` with value iteration.

    The model is deterministic: each action moves one cell (or stays in place
    when it would leave the grid). Every step costs -1, stepping onto the goal
    pays +100 and stepping into a hole pays -100. Goal and hole cells are
    absorbing with zero reward.

    Args:
        env: Object exposing ``grid_size`` and a ``grid`` indexable as
            ``grid[row, col]`` with cell codes ('G' = goal, 'H' = hole).
        gamma: Discount factor.
        theta: Sweeps stop once the largest value change falls below this.

    Returns:
        Integer array of length ``grid_size ** 2`` (row-major state order)
        with the greedy action per state: 0=Up, 1=Down, 2=Left, 3=Right.
        Goal and hole states get action 0 because all their action values tie.
    """
    side = env.grid_size
    n_states = side ** 2
    # (row delta, col delta) per action index: Up, Down, Left, Right
    moves = ((-1, 0), (1, 0), (0, -1), (0, 1))
    n_actions = len(moves)

    values = np.zeros(n_states)
    rewards = np.full((n_states, n_actions), -1.0)  # default step cost
    transitions = np.zeros((n_states, n_actions, n_states))  # transition probabilities

    # Fill in the reward and transition tables cell by cell
    for row in range(side):
        for col in range(side):
            here = row * side + col
            if env.grid[row, col] in ('G', 'H'):
                # Terminal cells: every action loops back with zero reward
                transitions[here, :, here] = 1.0
                rewards[here, :] = 0.0
                continue
            for action, (d_row, d_col) in enumerate(moves):
                nxt_row, nxt_col = row + d_row, col + d_col
                if not (0 <= nxt_row < side and 0 <= nxt_col < side):
                    # Moving off the grid leaves the agent where it is
                    nxt_row, nxt_col = row, col
                transitions[here, action, nxt_row * side + nxt_col] = 1.0
                landed_on = env.grid[nxt_row, nxt_col]
                if landed_on == 'G':
                    rewards[here, action] = 100.0
                elif landed_on == 'H':
                    rewards[here, action] = -100.0

    def backup(state):
        # Each transition row is one-hot, so the matrix product simply picks
        # the successor's value: reward + gamma * V(successor) for every action.
        return rewards[state] + gamma * (transitions[state] @ values)

    # Value iteration with in-place sweeps
    converged = False
    while not converged:
        delta = 0
        for state in range(n_states):
            previous = values[state]
            values[state] = backup(state).max()
            delta = max(delta, abs(previous - values[state]))
        converged = delta < theta

    # Greedy policy with respect to the converged values
    policy = np.zeros(n_states, dtype=int)
    for state in range(n_states):
        policy[state] = np.argmax(backup(state))

    return policy

# Translate numeric actions into readable direction names
def policy_to_directions(policy, grid_size):
    """Map each action index in ``policy`` to its direction name.

    Args:
        policy: Iterable of action indices (0=Up, 1=Down, 2=Left, 3=Right).
        grid_size: Side length used to reshape the flat result.

    Returns:
        ``(grid_size, grid_size)`` NumPy array of direction strings.
    """
    labels = ('Up', 'Down', 'Left', 'Right')
    named_actions = []
    for action_index in policy:
        named_actions.append(labels[action_index])
    flat_directions = np.array(named_actions)
    return flat_directions.reshape(grid_size, grid_size)

optimal_policy = find_optimal_policy(frozen_lake)
policy_directions = policy_to_directions(optimal_policy, frozen_lake.grid_size)

# Report the optimal action for every cell, walking the grid in row-major order
for (i, j), direction in np.ndenumerate(policy_directions):
    print(f"En el estado ({i}, {j}): la política óptima es: {direction}")

En el estado (0, 0): la política óptima es: Down
En el estado (0, 1): la política óptima es: Right
En el estado (0, 2): la política óptima es: Down
En el estado (0, 3): la política óptima es: Down
En el estado (1, 0): la política óptima es: Down
En el estado (1, 1): la política óptima es: Up
En el estado (1, 2): la política óptima es: Down
En el estado (1, 3): la política óptima es: Down
En el estado (2, 0): la política óptima es: Down
En el estado (2, 1): la política óptima es: Down
En el estado (2, 2): la política óptima es: Down
En el estado (2, 3): la política óptima es: Down
En el estado (3, 0): la política óptima es: Right
En el estado (3, 1): la política óptima es: Right
En el estado (3, 2): la política óptima es: Right
En el estado (3, 3): la política óptima es: Up
