# Examen Corto 1
## Task Frozen Lake
### Integrantes:
- Diego Leiva       21752
- Maria Ramirez     21342
- Gustavo Gonzalez  21438

**Librerias Necesarias**

In [2]:
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
import numpy as np
import warnings

**Configuracion inicial**

In [3]:
# Silence every warning so library notices do not clutter the cell outputs
warnings.filterwarnings('ignore')

# Fix NumPy's global random generator so np.random draws are repeatable
SEED = 42
np.random.seed(SEED)

**Crear el entorno Frozen Lake**

Tipos de casilla:
- S: starting point, seguro
- F: frozen surface, seguro
- H: hole, atrapado
- G: goal, seguro

Acciones:
- 0: Move left
- 1: Move down
- 2: Move right
- 3: Move up

Recompensas:
- Reach goal: +1
- Reach hole: 0
- Reach frozen: 0

Probabilidades:
- P(intended) = 1/3
- P(perpendicular direction, one side) = 1/3
- P(perpendicular direction, other side) = 1/3

In [4]:
# Seed for everything random inside the environment; same value as the
# NumPy seed set in the configuration cell
ENV_SEED = 42

# Build the environment from a randomly generated 4x4 map.
# `map_name` is intentionally not passed: when `desc` is given it takes
# precedence and `map_name` is ignored.
env = gym.make("FrozenLake-v1",
               desc=generate_random_map(size=4, seed=ENV_SEED),  # reproducible random map
               is_slippery=True,                                 # slippery transitions
               render_mode="ansi")                               # text rendering

# np.random.seed() does not reach the environment: it keeps its own
# generators, so seed the transitions and the action sampler explicitly
env.reset(seed=ENV_SEED)
env.action_space.seed(ENV_SEED)

# Show the initial board
print(env.render())


[41mS[0mFFH
FFFF
FFFF
HFFG



**Q-Learning**

In [5]:
# Q-table: one row per observation, one column per action, all zeros at start
n_states = env.observation_space.n
n_actions = env.action_space.n
qtable = np.zeros((n_states, n_actions))

# Hyperparameters
episodes = 100_000          # number of episodes to run
alpha = 0.9                 # learning rate
gamma = 0.9                 # discount factor
epsilon = 1.0               # probability of picking a random action
epsilon_decay = 1e-4        # amount subtracted from epsilon after each episode

# Show the table before any learning has happened
print('Q-table antes del entrenamiento: ')
print(qtable)

Q-table antes del entrenamiento: 
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


**Entrenamiento del Agente**

In [6]:
# Train the agent with tabular Q-learning and an epsilon-greedy policy.
#
# The exploration rate and learning rate are tracked in cell-local variables:
# the hyperparameters `epsilon` and `alpha` defined earlier are only read, so
# re-running this cell restarts the schedule instead of training with
# epsilon = 0 and the reduced learning rate left over from a previous run.
# NOTE: `qtable` is still updated in place, so a re-run continues from the
# current table rather than from zeros.
current_epsilon = epsilon
current_alpha = alpha
min_alpha = 0.0001          # learning rate used once exploration has stopped

for i in range(episodes):
    state = env.reset()[0]
    terminated = False
    truncated = False

    # Keep acting until the environment reports termination or truncation
    while not truncated and not terminated:
        # Draw a number in [0, 1) to choose between exploring and exploiting
        rnd = np.random.random()

        if rnd < current_epsilon:
            # Explore: random action
            action = env.action_space.sample()
        else:
            # Exploit: action with the highest current estimate
            action = np.argmax(qtable[state])

        # Apply the action and observe the outcome
        new_state, reward, terminated, truncated, info = env.step(action)

        # Q-learning update of Q(s, a) toward reward + gamma * max_a' Q(s', a')
        qtable[state, action] = qtable[state, action] + current_alpha * (
            reward + gamma * np.max(qtable[new_state]) - qtable[state, action]
        )

        # Move on to the next state
        state = new_state

    # Linear decay of the exploration rate, clamped at zero
    current_epsilon = max(current_epsilon - epsilon_decay, 0)

    # Once exploration has stopped, switch to the small learning rate
    if current_epsilon == 0:
        current_alpha = min_alpha

# Show the learned Q-table
print('Q-table despues del entrenamiento:')
print(f"{qtable}\n")

Q-table despues del entrenamiento:
[[0.04001333 0.20408121 0.04221602 0.06042139]
 [0.05118376 0.04343698 0.24549148 0.05124135]
 [0.26802126 0.00794028 0.00449194 0.02506428]
 [0.         0.         0.         0.        ]
 [0.05210756 0.05342699 0.05118824 0.222903  ]
 [0.05766661 0.06039997 0.29934615 0.06025739]
 [0.06802976 0.06632403 0.37476753 0.09758846]
 [0.056617   0.46856279 0.03917581 0.04038703]
 [0.0088867  0.01204284 0.01753667 0.06891997]
 [0.05083247 0.05720338 0.370922   0.07217753]
 [0.11241061 0.50640837 0.12603504 0.12838157]
 [0.14955211 0.69522522 0.16528515 0.10150858]
 [0.         0.         0.         0.        ]
 [0.04697917 0.05294616 0.42518185 0.06980289]
 [0.19999408 0.19755945 0.24083521 0.61205643]
 [0.         0.         0.         0.        ]]



**Evaluacion del entrenamiento**

In [7]:
# Evaluate the greedy policy: run `episodes` episodes and accumulate the
# rewards received (the total is reported as a percentage of `episodes`).
success = 0

for i in range(episodes):
    state = env.reset()[0]
    terminated = False
    truncated = False
    # An episode is over as soon as it terminates OR is truncated
    done = terminated or truncated

    while not done:
        # Always take the action with the highest learned value
        action = np.argmax(qtable[state])

        # env.step returns (observation, reward, terminated, truncated, info),
        # in that order
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        state = new_state
        success += reward

# Report the success rate
print(f"Tasa de exito: {round(success/episodes*100,2)}%")

Tasa de exito: 100.0%


**Visualizar la politica aprendida**

In [8]:
# Start a fresh episode and show the initial board
state = env.reset()[0]
terminated = truncated = False
sequence = []
print(env.render())

# Roll out one episode until the environment reports termination or truncation
while not (terminated or truncated):
    # Use the greedy action when this state has a positive estimate;
    # otherwise fall back to a random action
    has_estimate = np.max(qtable[state]) > 0
    action = np.argmax(qtable[state]) if has_estimate else env.action_space.sample()

    # Record the chosen action
    sequence.append(action)

    # Apply the action and advance to the resulting state
    new_state, reward, terminated, truncated, info = env.step(action)
    state = new_state

    # Show the board after the move
    print(env.render())

# Action index -> direction name
action_mapping = dict(enumerate(('LEFT', 'DOWN', 'RIGHT', 'UP')))

# Translate the recorded action indices into direction names
mapped_sequence = list(map(action_mapping.__getitem__, sequence))

# Print the sequence of moves
print(f"Secuencia: {mapped_sequence}")


[41mS[0mFFH
FFFF
FFFF
HFFG

  (Down)
S[41mF[0mFH
FFFF
FFFF
HFFG

  (Right)
SFFH
F[41mF[0mFF
FFFF
HFFG

  (Right)
SFFH
FF[41mF[0mF
FFFF
HFFG

  (Right)
SFFH
FFF[41mF[0m
FFFF
HFFG

  (Down)
SFFH
FF[41mF[0mF
FFFF
HFFG

  (Right)
SF[41mF[0mH
FFFF
FFFF
HFFG

  (Left)
SF[41mF[0mH
FFFF
FFFF
HFFG

  (Left)
SF[41mF[0mH
FFFF
FFFF
HFFG

  (Left)
S[41mF[0mFH
FFFF
FFFF
HFFG

  (Right)
S[41mF[0mFH
FFFF
FFFF
HFFG

  (Right)
SFFH
F[41mF[0mFF
FFFF
HFFG

  (Right)
S[41mF[0mFH
FFFF
FFFF
HFFG

  (Right)
S[41mF[0mFH
FFFF
FFFF
HFFG

  (Right)
S[41mF[0mFH
FFFF
FFFF
HFFG

  (Right)
SF[41mF[0mH
FFFF
FFFF
HFFG

  (Left)
SFFH
FF[41mF[0mF
FFFF
HFFG

  (Right)
SF[41mF[0mH
FFFF
FFFF
HFFG

  (Left)
S[41mF[0mFH
FFFF
FFFF
HFFG

  (Right)
SFFH
F[41mF[0mFF
FFFF
HFFG

  (Right)
S[41mF[0mFH
FFFF
FFFF
HFFG

  (Right)
SFFH
F[41mF[0mFF
FFFF
HFFG

  (Right)
SFFH
FF[41mF[0mF
FFFF
HFFG

  (Right)
SF[41mF[0mH
FFFF
FFFF
HFFG

  (Left)
SFFH
FF[41mF[0mF
FFFF
HFFG

  (Right)
SFFH
FF