In [4]:
# Imports 
import numpy as np
from tqdm import trange,tqdm

In [10]:
# Constants
GOAL_SCORE = 100.0  # reward stored in the goal cell; is_terminal_state compares cells against it
ALPHA = 0.1         # default learning rate for train()
GAMMA = 0.95        # default discount factor for train()
EPSILON = 0.8       # default initial exploration probability for train()
EPOCHS = 10000      # NOTE(review): not referenced by the visible cells (train() has its own limit)
SEED = 42           # fixed seed so that Restart & Run All reproduces the same results

# Every stochastic helper below draws from NumPy's global RNG.
np.random.seed(SEED)

# Variables
grid_rows = 3
grid_cols = 4

# One Q-value per (row, col, action)
q_table = np.zeros((grid_rows, grid_cols, 4))

# Index in this list == action index used by the Q-table's last axis
ACTIONS = ['up', 'down', 'left', 'right']

# Float fill value on purpose: with an integer fill the array would be integer-typed
# and assigning GOAL_SCORE would silently truncate it, so a non-integral goal score
# would never compare equal to GOAL_SCORE and the goal would not be detected.
rewards = np.full((grid_rows, grid_cols), -1.0)
rewards[0, 3] = GOAL_SCORE  # goal
obstacles = [[1, 1]]  # obstacle

# 1.b
q_table_b = np.zeros((grid_rows, grid_cols, 4))
rewards_b = np.array([
    [-3., -2., -1., GOAL_SCORE],
    [-4., -100., -2., -1.],
    [-5., -4., -3., -2.]
])
rewards_b[0, 3] = GOAL_SCORE
obstacles_b = [[1, 1]]

In [14]:
# Funciones del Entorno

def is_terminal_state(row, col, grid_rewards):
    """Tell whether cell (row, col) is terminal, i.e. its reward equals GOAL_SCORE."""
    cell_reward = grid_rewards[row, col]
    return bool(cell_reward == GOAL_SCORE)

def get_random_location(grid_rows, grid_cols, obstacles, grid_rewards):
    """Draw random (row, col) cells until one is neither terminal nor an obstacle."""
    row = np.random.randint(grid_rows)
    col = np.random.randint(grid_cols)
    # Re-sample for as long as the candidate cell is not a valid starting point.
    while is_terminal_state(row, col, grid_rewards) or [row, col] in obstacles:
        row = np.random.randint(grid_rows)
        col = np.random.randint(grid_cols)
    return row, col

def get_next_action(row, col, q_table, epsilon, drunkenness=0.0):
    """
    Pick the index of the next action for the state (row, col).

    Decision order:
    1. Drunkenness: with probability `drunkenness` a random action is returned.
    2. Exploration (epsilon-greedy): with probability `epsilon` a random action is returned.
    3. Exploitation: otherwise the action with the highest Q-value is returned.

    No random number is drawn for step 1 when `drunkenness` is not positive.
    """
    n_actions = 4

    stumbles = drunkenness > 0.0 and np.random.random() < drunkenness
    if not stumbles:
        explores = np.random.random() < epsilon
        if not explores:
            # Greedy choice
            return np.argmax(q_table[row, col])

    # Either the drunken stumble or the exploration branch fired
    return np.random.randint(n_actions)

def get_next_state(row, col, action_index, obstacles):
    """
    Apply the action `ACTIONS[action_index]` to the state (row, col) and return the new state.

    This function does not choose the action; it only executes the move.
    The agent stays where it is when the move would leave the grid
    (bounds come from the module-level grid_rows / grid_cols) or when the
    destination cell is listed in `obstacles`.
    """
    # For every action: (is the move allowed by the grid bounds?, destination cell)
    candidates = {
        'up': (row > 0, (row - 1, col)),
        'down': (row < grid_rows - 1, (row + 1, col)),
        'left': (col > 0, (row, col - 1)),
        'right': (col < grid_cols - 1, (row, col + 1)),
    }
    allowed, destination = candidates.get(ACTIONS[action_index], (False, (row, col)))
    if not allowed:
        destination = (row, col)

    # Bumping into an obstacle leaves the agent in place
    if list(destination) in obstacles:
        return row, col

    return destination

def get_shortest_path(start_row, start_col, q_table, grid_rewards, obstacles):
    """
    Follow the greedy policy encoded in the Q-table from a start cell towards the goal.

    Actions are chosen with epsilon=0 and drunkenness=0, i.e. always the action
    with the highest Q-value.

    Args:
        start_row, start_col: Coordinates of the starting cell.
        q_table: Q-table indexed as q_table[row, col, action].
        grid_rewards: Reward matrix used to recognise the terminal (goal) cell.
        obstacles: List of [row, col] obstacle coordinates.

    Returns:
        A list of [row, col] cells from the start to the goal (both included),
        or an empty list when the start cell is already terminal.
        If the greedy policy never reaches the goal (e.g. an untrained Q-table),
        a RuntimeWarning is issued and the partial path walked so far is returned;
        in that case the last cell of the path is not terminal.
    """
    import warnings

    if is_terminal_state(start_row, start_col, grid_rewards):
        return []

    current_row, current_col = start_row, start_col
    path = [[current_row, current_col]]
    visited = {(current_row, current_col)}

    while not is_terminal_state(current_row, current_col, grid_rewards):
        # epsilon=0.0 so that only the best known action is used
        action_index = get_next_action(current_row, current_col, q_table, epsilon=0.0, drunkenness=0.0)
        next_row, next_col = get_next_state(
            current_row, current_col, action_index, obstacles
        )

        # The greedy walk is deterministic, so coming back to a cell that was
        # already visited (including not moving at all) means it would loop forever.
        if (next_row, next_col) in visited:
            warnings.warn(
                "Greedy policy loops at "
                f"[{next_row}, {next_col}] without reaching the goal; returning partial path.",
                RuntimeWarning,
                stacklevel=2,
            )
            break

        visited.add((next_row, next_col))
        current_row, current_col = next_row, next_col
        path.append([current_row, current_col])

    return path

In [15]:
def train(q_table, grid_rewards, obstacles, 
          alpha=ALPHA, gamma=GAMMA, 
          epsilon=EPSILON, epsilon_min=0.01, epsilon_decay=0.995,
          conv_threshold=0.001, patience=10,
          drunkenness=0.0, epoch_limit=30000):
    """
    Train the agent with tabular Q-Learning; every episode starts at cell (2, 0).

    The Q-table passed in is updated in place, so after the call it holds the
    learned values.

    Args:
        q_table: Initial Q-table indexed as q_table[row, col, action]; modified in place.
        grid_rewards: Reward matrix; each step is rewarded with the value of the
            cell the agent ends up in.
        obstacles: List of [row, col] obstacle coordinates.
        alpha: Learning rate of the Q-value update.
        gamma: Discount factor applied to the best Q-value of the next state.
        epsilon: Initial exploration probability.
        epsilon_min: Lower bound for epsilon.
        epsilon_decay: Factor epsilon is multiplied by after every epoch.
        conv_threshold: An epoch counts as stable when the 2-norm of the change
            of the whole Q-table during that epoch is below this value.
        patience: Number of consecutive stable epochs required to stop early.
        drunkenness: Probability that the agent picks a random action (noise).
        epoch_limit: Maximum number of epochs to run.

    Returns:
        dict with the keys:
            "snapshots": up to four Q-table copies in chronological order
                (initial, ~1/3, ~2/3, final).
            "avg_reward_final": mean episode reward over the last 100 epochs
                (over all epochs when fewer were run).
            "avg_reward_total": mean episode reward over all epochs.
            "total_epochs": number of epochs actually run.
            "is_converged": True when training stopped through the convergence check.
            "reward_history": list with the total reward of every epoch.
    """
    
    # One snapshot before training plus one after every epoch
    snapshots = [q_table.copy()]
    reward_history = []
    
    total_epochs = 0
    stable_epochs = 0
    is_converged = False
    
    if drunkenness > 0.0:
        print(f"Iniciando entrenamiento (Drunkenness={drunkenness})...")
    else:
        print("Iniciando entrenamiento...")

    for _ in trange(epoch_limit):
        total_epochs += 1
        
        # Fixed initial state (2, 0)
        current_row, current_col = 2, 0 
        
        total_reward = 0
        old_q_table = q_table.copy()
        
        while not is_terminal_state(current_row, current_col, grid_rewards):
            # Remember the state we act from; its Q(s, a) is the one being updated
            old_row, old_col = current_row, current_col
            
            # 1. Choose the action (policy + drunkenness, if any)
            action_index = get_next_action(
                current_row, current_col, 
                q_table, epsilon, drunkenness
            )
            
            # 2. Execute the move
            current_row, current_col = get_next_state(
                current_row, current_col, 
                action_index, obstacles
            )
            
            # 3. Reward of the cell the agent landed on
            reward = grid_rewards[current_row, current_col]
            
            # 4. Q-value update:
            # Q(s,a) = Q(s,a) + alpha * (R + gamma * max(Q(s',a')) - Q(s,a))
            old_q_value = q_table[old_row, old_col, action_index]
            best_next_q = np.max(q_table[current_row, current_col])
            
            temporal_difference = reward + (gamma * best_next_q) - old_q_value
            new_q_value = old_q_value + (alpha * temporal_difference)
            
            q_table[old_row, old_col, action_index] = new_q_value
            
            total_reward += reward
            
        # End of the epoch
        
        # How much the whole Q-table moved during this epoch
        delta = np.linalg.norm(q_table - old_q_table)
        reward_history.append(total_reward)
        
        # Epsilon decay
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        
        # Snapshot after every epoch, so snapshots[-1] is always the latest table
        snapshots.append(q_table.copy())
        
        if delta < conv_threshold:
            stable_epochs += 1
            if stable_epochs >= patience:
                print(f"Converged after {total_epochs} epochs.")
                is_converged = True
                break
        else:
            stable_epochs = 0
            
    # Representative snapshots (initial, ~1/3, ~2/3, final).
    # The last one is addressed by its positive index: a literal -1 would be
    # sorted before 0 and put the final table in front of the initial one.
    n_snaps = len(snapshots)
    indices = sorted({0, int(n_snaps * 0.33), int(n_snaps * 0.66), n_snaps - 1})
    snapshots_output = [snapshots[i] for i in indices]
    
    # Final metrics (NaN when no epoch was run, e.g. epoch_limit=0)
    if reward_history:
        avg_reward_final = np.mean(reward_history[-100:])
        avg_reward_total = np.mean(reward_history)
    else:
        avg_reward_final = avg_reward_total = float("nan")
    
    return {
        "snapshots": snapshots_output,
        "avg_reward_final": avg_reward_final,
        "avg_reward_total": avg_reward_total,
        "total_epochs": total_epochs,
        "is_converged": is_converged,
        "reward_history": reward_history
    }

In [30]:
# ==========================================
# Exercise 1.a run (basic environment)
# ==========================================

# Fresh Q-table so that training always starts from scratch
q_table = np.zeros((grid_rows, grid_cols, 4))

# Train a sober agent (drunkenness=0.0) with a tight convergence criterion
print("Entrenando Agente 1.a...")
results_1a = train(
    q_table, rewards, obstacles,
    alpha=ALPHA, gamma=GAMMA, epsilon=EPSILON,
    conv_threshold=0.0001, patience=20,
    drunkenness=0.0,
)

# Training summary
print(
    f"\nEntrenamiento finalizado en {results_1a['total_epochs']} épocas.",
    f"¿Convergió?: {results_1a['is_converged']}",
    f"Recompensa Promedio Final: {results_1a['avg_reward_final']:.2f}",
    sep="\n",
)

# Greedy path learned from the start cell [2, 0]
print("\nCamino Óptimo Aprendido (desde [2,0]):")
path_1a = get_shortest_path(2, 0, q_table, rewards, obstacles)
print(path_1a)

# Uncomment to inspect the learned Q-table:
# display(q_table)

Entrenando Agente 1.a...
Iniciando entrenamiento...


  2%|▏         | 591/30000 [00:00<00:06, 4331.45it/s]

Converged after 592 epochs.

Entrenamiento finalizado en 592 épocas.
¿Convergió?: True
Recompensa Promedio Final: 95.72

Camino Óptimo Aprendido (desde [2,0]):
[[2, 0], [2, 1], [2, 2], [2, 3], [1, 3], [0, 3]]





In [None]:
# ==========================================
# Exercise 1.b run (varied environment)
# ==========================================

print("=== Configurando Entorno 1.b ===")

# Environment 1.b: every cell has its own step cost.
# Cell [1, 1] carries -100 but is never entered because it is also an obstacle.
rewards_b = np.array([
    [-3., -2., -1., GOAL_SCORE],
    [-4., -100., -2., -1.],
    [-5., -4., -3., -2.]
])
rewards_b[0, 3] = GOAL_SCORE  # make sure the goal cell holds the goal reward
obstacles_b = [[1, 1]]        # same obstacle as in 1.a

print("Recompensas 1.b:", rewards_b, sep="\n")

# Fresh Q-table for 1.b (sized with the lowercase grid_rows / grid_cols)
q_table_b = np.zeros((grid_rows, grid_cols, 4))

# Training
print("\nEntrenando Agente 1.b...")
results_1b = train(
    q_table_b, rewards_b, obstacles_b,
    alpha=ALPHA, gamma=GAMMA, epsilon=EPSILON,
    conv_threshold=0.0001, patience=20,
    drunkenness=0.0,
)

# Training summary
print(
    f"\nEntrenamiento 1.b finalizado en {results_1b['total_epochs']} épocas.",
    f"¿Convergió?: {results_1b['is_converged']}",
    f"Recompensa Promedio Final: {results_1b['avg_reward_final']:.2f}",
    sep="\n",
)

# Greedy path learned from the start cell [2, 0]
print("\nCamino Óptimo Aprendido 1.b (desde [2,0]):")
path_1b = get_shortest_path(2, 0, q_table_b, rewards_b, obstacles_b)
print(path_1b)

=== Configurando Entorno 1.b ===
Recompensas 1.b:
[[  -3.   -2.   -1.  100.]
 [  -4. -100.   -2.   -1.]
 [  -5.   -4.   -3.   -2.]]

Entrenando Agente 1.b...
Iniciando entrenamiento...


  3%|▎         | 779/30000 [00:00<00:04, 6577.90it/s]

Converged after 780 epochs.

Entrenamiento 1.b finalizado en 780 épocas.
¿Convergió?: True
Recompensa Promedio Final: 89.34

Camino Óptimo Aprendido 1.b (desde [2,0]):
[[2, 0], [1, 0], [0, 0], [0, 1], [0, 2], [0, 3]]





**Drunken sailor**

In [None]:
# ==========================================
# Exercise 1.c run (varied environment, drunken agent)
# ==========================================

print("=== Entrenando Drunken Sailor (1.c) ===")

# 1% chance of a random action on every step (99% of the time the policy decides)
drunkenness_level = 0.01

# Fresh Q-table for the drunken agent
q_table_drunk = np.zeros((grid_rows, grid_cols, 4))

# Training on the 1.b environment; alpha, gamma and epsilon keep train()'s defaults
results_drunk = train(
    q_table=q_table_drunk,
    grid_rewards=rewards_b,
    obstacles=obstacles_b,
    conv_threshold=0.0001,
    patience=20,
    drunkenness=drunkenness_level,
)

# Individual results
print(
    f"\nEntrenamiento (Drunkenness={drunkenness_level}) finalizado en {results_drunk['total_epochs']} épocas.",
    f"¿Convergió?: {results_drunk['is_converged']}",
    f"Recompensa Final: {results_drunk['avg_reward_final']:.2f}",
    sep="\n",
)

# Greedy path learned by the drunken agent
print("\nCamino aprendido por el agente borracho:")
path_drunk = get_shortest_path(2, 0, q_table_drunk, rewards_b, obstacles_b)
print(path_drunk)

=== Entrenando Drunken Sailor (1.c) ===
Iniciando entrenamiento (Drunkenness=0.01)...


  2%|▏         | 717/30000 [00:00<00:04, 5918.49it/s]

Converged after 718 epochs.

Entrenamiento (Drunkenness=0.01) finalizado en 718 épocas.
¿Convergió?: True
Recompensa Final: 89.46

Camino aprendido por el agente borracho:
[[2, 0], [1, 0], [0, 0], [0, 1], [0, 2], [0, 3]]





In [None]:
# ==========================================
# Comparison: impact of "drunkenness" on convergence speed
# ==========================================

runs_comparison = 10
comparison_drunkenness = 0.01  # single source for both the training call and the report label
epochs_sober_list = []
epochs_drunk_list = []

print(f"Iniciando comparativa ({runs_comparison} runs por configuración)...")

t = trange(runs_comparison, desc="Runs")
for _ in t:
    # A. Sober agent (drunkenness = 0.0)
    qt_sober = np.zeros((grid_rows, grid_cols, 4))
    res_s = train(
        qt_sober, rewards_b, obstacles_b,
        alpha=0.1, gamma=0.99, epsilon=0.8,
        conv_threshold=0.001, patience=20, drunkenness=0.0
    )
    if res_s['is_converged']:
        epochs_sober_list.append(res_s['total_epochs'])
        
    # B. Drunken agent
    qt_drunk = np.zeros((grid_rows, grid_cols, 4))
    res_d = train(
        qt_drunk, rewards_b, obstacles_b, 
        alpha=0.1, gamma=0.99, epsilon=0.8,
        conv_threshold=0.001, patience=20, drunkenness=comparison_drunkenness
    )
    if res_d['is_converged']:
        epochs_drunk_list.append(res_d['total_epochs'])

# Summary. Only converged runs enter the averages, so report how many there were.
print("\n" + "="*40)
print(f"RESULTADOS (Promedio de {runs_comparison} runs)")
print("="*40)
print(
    f"Runs convergidos: sobrio {len(epochs_sober_list)}/{runs_comparison}, "
    f"borracho {len(epochs_drunk_list)}/{runs_comparison}"
)

if epochs_sober_list and epochs_drunk_list:
    mean_sober = np.mean(epochs_sober_list)
    mean_drunk = np.mean(epochs_drunk_list)
    impact_pct = ((mean_drunk - mean_sober) / mean_sober) * 100

    print(f"Sobrio (0.0):   {mean_sober:.1f} epochs")
    print(f"Borracho ({comparison_drunkenness}): {mean_drunk:.1f} epochs")
    print("-" * 40)
    # The "+" format flag prints the real sign, so a negative impact is not shown as "+-x%"
    print(f"Impacto de la borrachera: {impact_pct:+.1f}% de tiempo para converger")
else:
    # Without converged runs on both sides the means (and the ratio) are undefined
    mean_sober = mean_drunk = impact_pct = float("nan")
    print("-" * 40)
    print("No hay suficientes runs convergidos para comparar.")

Iniciando comparativa (10 runs por configuración)...


Runs:   0%|          | 0/10 [00:00<?, ?it/s]

Iniciando entrenamiento...


  2%|▏         | 738/30000 [00:00<00:05, 5293.21it/s]


Converged after 739 epochs.
Iniciando entrenamiento (Drunkenness=0.01)...


  3%|▎         | 779/30000 [00:00<00:05, 5061.94it/s]
Runs:  10%|█         | 1/10 [00:00<00:02,  3.26it/s]

Converged after 780 epochs.
Iniciando entrenamiento...


  3%|▎         | 776/30000 [00:00<00:05, 4937.27it/s]


Converged after 777 epochs.
Iniciando entrenamiento (Drunkenness=0.01)...


  3%|▎         | 776/30000 [00:00<00:05, 5797.16it/s]
Runs:  20%|██        | 2/10 [00:00<00:02,  3.28it/s]

Converged after 777 epochs.
Iniciando entrenamiento...


  2%|▏         | 635/30000 [00:00<00:05, 5640.32it/s]


Converged after 636 epochs.
Iniciando entrenamiento (Drunkenness=0.01)...


  2%|▏         | 619/30000 [00:00<00:06, 4511.73it/s]
Runs:  30%|███       | 3/10 [00:00<00:02,  3.50it/s]

Converged after 620 epochs.
Iniciando entrenamiento...


  2%|▏         | 579/30000 [00:00<00:05, 5291.89it/s]


Converged after 580 epochs.
Iniciando entrenamiento (Drunkenness=0.01)...


  2%|▏         | 614/30000 [00:00<00:07, 4109.46it/s]
Runs:  40%|████      | 4/10 [00:01<00:01,  3.58it/s]

Converged after 615 epochs.
Iniciando entrenamiento...


  2%|▏         | 629/30000 [00:00<00:06, 4821.60it/s]


Converged after 630 epochs.
Iniciando entrenamiento (Drunkenness=0.01)...


  3%|▎         | 817/30000 [00:00<00:06, 4420.46it/s]
Runs:  50%|█████     | 5/10 [00:01<00:01,  3.35it/s]

Converged after 818 epochs.
Iniciando entrenamiento...


  2%|▏         | 623/30000 [00:00<00:05, 4910.76it/s]


Converged after 624 epochs.
Iniciando entrenamiento (Drunkenness=0.01)...


  2%|▏         | 721/30000 [00:00<00:05, 4977.38it/s]
Runs:  60%|██████    | 6/10 [00:01<00:01,  3.40it/s]

Converged after 722 epochs.
Iniciando entrenamiento...


  2%|▏         | 722/30000 [00:00<00:06, 4735.58it/s]


Converged after 723 epochs.
Iniciando entrenamiento (Drunkenness=0.01)...


  2%|▏         | 621/30000 [00:00<00:07, 4146.77it/s]
Runs:  70%|███████   | 7/10 [00:02<00:00,  3.32it/s]

Converged after 622 epochs.
Iniciando entrenamiento...


  2%|▏         | 674/30000 [00:00<00:06, 4234.72it/s]


Converged after 675 epochs.
Iniciando entrenamiento (Drunkenness=0.01)...


  2%|▏         | 685/30000 [00:00<00:07, 3975.25it/s]
Runs:  80%|████████  | 8/10 [00:02<00:00,  3.17it/s]

Converged after 686 epochs.
Iniciando entrenamiento...


  2%|▏         | 579/30000 [00:00<00:06, 4270.70it/s]


Converged after 580 epochs.
Iniciando entrenamiento (Drunkenness=0.01)...


  2%|▏         | 681/30000 [00:00<00:06, 4374.15it/s]
Runs:  90%|█████████ | 9/10 [00:02<00:00,  3.20it/s]

Converged after 682 epochs.
Iniciando entrenamiento...


  2%|▏         | 656/30000 [00:00<00:05, 5069.99it/s]


Converged after 657 epochs.
Iniciando entrenamiento (Drunkenness=0.01)...


  3%|▎         | 795/30000 [00:00<00:05, 5450.09it/s]
Runs: 100%|██████████| 10/10 [00:03<00:00,  3.31it/s]

Converged after 796 epochs.

RESULTADOS (Promedio de 10 runs)
Sobrio (0.0):   662.1 epochs
Borracho (0.1): 711.8 epochs
----------------------------------------
Impacto del Ruido: +7.5% de tiempo para converger



