# Examen Corto 1
## Task Frozen Lake
### Integrantes:
- Diego Leiva       21752
- Maria Ramirez     21342
- Gustavo Gonzalez  21438

**Librerias Necesarias**

In [1]:
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
import numpy as np
import warnings

**Configuracion inicial**

In [2]:
warnings.simplefilter('ignore')
np.random.seed(42)

**Crear el entorno Frozen Lake**

Estados:
- S: starting point, seguro
- F: frozen surface, seguro
- H: hole, atrapado
- G: goal, seguro

Acciones:
- 0: Move left
- 1: Move down
- 2: Move right
- 3: Move up

Recompensas:
- Reach goal: +1
- Reach hole: 0
- Reach frozen: 0

Probabilidades:
- P(intended) = 1/3
- P(perpendicular direction) = 1/3
- P(perpendicular direction) = 1/3

In [3]:
# Crear un nuevo entorno
env = gym.make("FrozenLake-v1", 
               desc = generate_random_map(size=4),  # Generar un mapa aleatorio
               map_name = "4x4",                    # Mapa 4x4
               is_slippery = True,                  # Entorno resbaladizo
               render_mode="ansi")                  # Modo de renderizado
env.reset()

# Visualizar el entorno
print(env.render())


[41mS[0mFFF
FFFH
FFFF
FFFG



**Q-Learning**

In [4]:
# Inicializacion de la Q table
qtable = np.zeros((env.observation_space.n, env.action_space.n))

# Definicion de Hiperparametros:
episodes = 100000
alpha = 0.9                 # Tasa de aprendizaje
gamma = 0.9                 # Tasa de descuento
epsilon = 1.0               # Aletoriedad en la seleccion de acciones
epsilon_decay = 0.0001      # Tasa de decaiminto de epsilon

# Mostrar la tabla
print('Q-table antes del entrenamiento: ')
print(qtable)

Q-table antes del entrenamiento: 
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


**Entrenamiento del Agente**

In [5]:
# Por cada episodio, explorar el entorno
for i in range(episodes):
    state = env.reset()[0]
    terminated = False
    truncated = False

    # Mientras que el agente no caiga en un hoyo o alcanze la meta, continuar entrenando
    while not truncated and not terminated:
        # Generar un numero aleatorio entre 0 y 1
        rnd = np.random.random()

        # Si el aleatorio es menor a epsilon, tomar una accion aleatoria
        if rnd < epsilon:
            action = env.action_space.sample()
        # de lo contrario, tomar la accion con mayor valor
        else:
            action = np.argmax(qtable[state])

        # Ejecutar la accion y mover el agente
        new_state, reward, terminated, truncated, info = env.step(action)

        # Actualizar Q(s,a)
        qtable[state, action] = qtable[state, action] + alpha * (
            reward + gamma * np.max(qtable[new_state]) - qtable[state, action]
            )
        
        # Actualizar el estado actual
        state = new_state

    # Actualizar el valor de epsilon
    epsilon = max(epsilon - epsilon_decay, 0)

    if epsilon == 0:
        alpha = 0.0001
            
# Visualizar la Q-table actualizada
print('Q-table despues del entrenamiento:')
print(f"{qtable}\n")

Q-table despues del entrenamiento:
[[4.05470111e-02 4.07281315e-02 1.88145073e-01 4.21094627e-02]
 [4.39056696e-02 6.04556316e-02 2.08042149e-01 4.81737913e-02]
 [2.19413771e-01 3.92787758e-02 4.45510281e-02 3.88236884e-02]
 [8.15032126e-03 1.04699876e-04 8.45659030e-05 1.05106226e-01]
 [4.10274301e-02 5.77677336e-02 2.21107827e-01 4.16730961e-02]
 [4.94800620e-02 5.02155264e-02 2.61005150e-01 4.98381122e-02]
 [2.95868002e-01 6.10299049e-03 1.33866925e-02 1.23337755e-07]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.83938945e-02 5.82268375e-02 2.70135908e-01 4.41425478e-02]
 [6.10579710e-02 3.56676310e-01 5.89662002e-02 5.26678377e-02]
 [9.65945591e-02 4.98960964e-01 9.55185478e-02 9.64416389e-02]
 [7.53626818e-02 6.91127857e-01 9.09990986e-02 3.10826344e-02]
 [6.33726988e-02 6.78231294e-02 3.07360552e-01 6.30366640e-02]
 [8.66578066e-02 1.30465114e-01 4.17502607e-01 8.25024746e-02]
 [1.49993084e-01 1.50331711e-01 1.58447034e-01 6.10901001e-01]
 [0.00000000e+00 0.0

**Evaluacion del entrenamiento**

In [6]:
success = 0

# Evaluar la tasa de exito
for i in range(episodes):
    state = env.reset()[0]
    terminated = False
    truncated = False
    done = truncated and terminated

    while not done:
        action = np.argmax(qtable[state])

        new_state, reward, truncated, terminated, info = env.step(action)
        done = truncated and terminated

        state = new_state
        success += reward

# Obtener la tasa de exito
print(f"Tasa de exito: {round(success/episodes*100,2)}%")

Tasa de exito: 100.0%


**Visualizar la politica aprendida**

In [7]:
state = env.reset()[0]
terminated = False
truncated = False

sequence = []
print(env.render())

while not terminated and not truncated:
    if np.max(qtable[state]) > 0:
      action = np.argmax(qtable[state])
    else:
      action = env.action_space.sample()
    
    # Agregar la accion a la secuencia
    sequence.append(action)

    # Ejecutar la accion y mover el agente
    new_state, reward, terminated, truncated, info = env.step(action)

    # Actualizar el estado actual
    state = new_state

    # Visualizar el movimiento
    print(env.render())


# Definir el mapeo de acciones
action_mapping = {
    0: 'LEFT',
    1: 'DOWN',
    2: 'RIGHT',
    3: 'UP'
}

# Mapear los valores numericos a direcciones
mapped_sequence = [action_mapping[action] for action in sequence]

# Mostrar la secuencia
print(f"Secuencia: {mapped_sequence}")


[41mS[0mFFF
FFFH
FFFF
FFFG

  (Right)
SFFF
[41mF[0mFFH
FFFF
FFFG

  (Right)
[41mS[0mFFF
FFFH
FFFF
FFFG

  (Right)
[41mS[0mFFF
FFFH
FFFF
FFFG

  (Right)
S[41mF[0mFF
FFFH
FFFF
FFFG

  (Right)
S[41mF[0mFF
FFFH
FFFF
FFFG

  (Right)
S[41mF[0mFF
FFFH
FFFF
FFFG

  (Right)
SFFF
F[41mF[0mFH
FFFF
FFFG

  (Right)
S[41mF[0mFF
FFFH
FFFF
FFFG

  (Right)
SF[41mF[0mF
FFFH
FFFF
FFFG

  (Left)
S[41mF[0mFF
FFFH
FFFF
FFFG

  (Right)
SF[41mF[0mF
FFFH
FFFF
FFFG

  (Left)
SF[41mF[0mF
FFFH
FFFF
FFFG

  (Left)
SFFF
FF[41mF[0mH
FFFF
FFFG

  (Left)
SFFF
FFFH
FF[41mF[0mF
FFFG

  (Down)
SFFF
FFFH
FFF[41mF[0m
FFFG

  (Down)
SFFF
FFFH
FFFF
FFF[41mG[0m

Secuencia: ['RIGHT', 'RIGHT', 'RIGHT', 'RIGHT', 'RIGHT', 'RIGHT', 'RIGHT', 'RIGHT', 'RIGHT', 'LEFT', 'RIGHT', 'LEFT', 'LEFT', 'LEFT', 'DOWN', 'DOWN']
