<a href="https://colab.research.google.com/github/LopezGarciaAxelSteven/ALGORITMOS-GEN-TICOS/blob/main/Aprendizaje_por_refuerzo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
from collections import defaultdict
import ipywidgets as widgets
from IPython.display import display, clear_output
import time

class FrozenLakeEnvironment:
    """Customizable Frozen Lake grid world.

    Cell legend:
        S = Start
        F = Frozen (safe to walk on)
        H = Hole (stepping on it ends the episode)
        G = Goal (reaching it ends the episode)

    The agent always starts at ``(0, 0)`` and the goal position is
    ``(size - 1, size - 1)``. States are numbered row-major as
    ``row * size + col``.
    """

    def __init__(self, size=4, hole_probability=0.1, custom_grid=None):
        """Create the environment.

        Args:
            size: Side length of the square grid. Only used when
                ``custom_grid`` is None; otherwise the side length is taken
                from the grid itself.
            hole_probability: Chance that each non-start, non-goal cell of a
                randomly generated grid becomes a hole.
            custom_grid: Optional square grid (sequence of rows of cell
                letters) to use instead of a random one.

        Raises:
            ValueError: If ``custom_grid`` is empty or not square.
        """
        self.hole_probability = hole_probability

        if custom_grid is not None:
            # Copy the rows so later edits to the caller's lists cannot
            # silently change this environment.
            grid = [list(row) for row in custom_grid]
            if not grid or any(len(row) != len(grid) for row in grid):
                raise ValueError(
                    "custom_grid debe ser una cuadrícula cuadrada no vacía"
                )
            # Trust the grid, not the `size` argument: goal position, bounds
            # checks and state numbering all depend on the real side length.
            self.size = len(grid)
            self.grid = grid
        else:
            self.size = size
            self.grid = self._create_random_grid()

        self.start_pos = (0, 0)
        self.goal_pos = (self.size - 1, self.size - 1)
        self.current_pos = self.start_pos
        self.done = False

        # Actions as (row delta, col delta): 0=Up, 1=Right, 2=Down, 3=Left
        self.actions = [(-1, 0), (0, 1), (1, 0), (0, -1)]
        self.action_names = ['↑', '→', '↓', '←']

    def _create_random_grid(self):
        """Build a random grid, turning cells into holes with ``hole_probability``."""
        last = self.size - 1
        grid = [['F' for _ in range(self.size)] for _ in range(self.size)]

        # Fixed start and goal corners
        grid[0][0] = 'S'
        grid[last][last] = 'G'

        # Draw one random number per remaining cell, in row-major order
        protected = {(0, 0), (last, last)}
        for i in range(self.size):
            for j in range(self.size):
                if (i, j) in protected:
                    continue
                if random.random() < self.hole_probability:
                    grid[i][j] = 'H'

        return grid

    @classmethod
    def from_pattern(cls, pattern_name):
        """Create one of the predefined environments.

        Args:
            pattern_name: One of 'clasico', 'facil', 'dificil', 'laberinto'.

        Returns:
            A new environment using the named layout.

        Raises:
            ValueError: If ``pattern_name`` is not a known pattern.
        """
        patterns = {
            'clasico': [
                ['S', 'F', 'F', 'F'],
                ['F', 'H', 'F', 'H'],
                ['F', 'F', 'F', 'H'],
                ['H', 'F', 'F', 'G']
            ],
            'facil': [
                ['S', 'F', 'F', 'F'],
                ['F', 'F', 'F', 'F'],
                ['F', 'F', 'H', 'F'],
                ['F', 'F', 'F', 'G']
            ],
            'dificil': [
                ['S', 'H', 'F', 'F'],
                ['F', 'H', 'H', 'F'],
                ['F', 'F', 'H', 'H'],
                ['H', 'F', 'F', 'G']
            ],
            'laberinto': [
                ['S', 'F', 'H', 'F', 'F'],
                ['H', 'F', 'H', 'F', 'H'],
                ['F', 'F', 'F', 'F', 'F'],
                ['F', 'H', 'H', 'F', 'F'],
                ['F', 'F', 'F', 'F', 'G']
            ]
        }

        if pattern_name not in patterns:
            raise ValueError(f"Patrón '{pattern_name}' no reconocido")

        grid = patterns[pattern_name]
        return cls(size=len(grid), custom_grid=grid)

    def reset(self):
        """Move the agent back to the start and return the initial state."""
        self.current_pos = self.start_pos
        self.done = False
        return self._get_state()

    def _get_state(self):
        """Convert the current (row, col) position into a numeric state."""
        return self.current_pos[0] * self.size + self.current_pos[1]

    def step(self, action):
        """Apply an action and return ``(new_state, reward, done)``.

        A move that would leave the grid keeps the agent in place (and is
        still rewarded according to the cell it stays on). Rewards: -10 for
        a hole, 100 for the goal, -1 for any other cell. Once the episode is
        over, further calls return reward 0 without moving.
        """
        if self.done:
            return self._get_state(), 0, True

        # Candidate position after the move
        d_row, d_col = self.actions[action]
        new_row = self.current_pos[0] + d_row
        new_col = self.current_pos[1] + d_col

        # Only move when the target stays inside the grid
        if (0 <= new_row < self.size) and (0 <= new_col < self.size):
            self.current_pos = (new_row, new_col)

        cell_type = self.grid[self.current_pos[0]][self.current_pos[1]]

        if cell_type == 'H':  # Hole
            reward = -10
            self.done = True
        elif cell_type == 'G':  # Goal
            reward = 100
            self.done = True
        else:  # Regular cell
            reward = -1  # Per-step penalty

        return self._get_state(), reward, self.done

    def render(self):
        """Print the grid, marking the agent's cell with square brackets."""
        # A real newline here: the original "\\n" printed a literal backslash-n.
        print("\nEntorno actual:")
        for i, row in enumerate(self.grid):
            display_row = []
            for j, cell in enumerate(row):
                if (i, j) == self.current_pos:
                    display_row.append(f'[{cell}]')
                else:
                    display_row.append(f' {cell} ')
            print(''.join(display_row))
        print()

    def get_grid_info(self):
        """Return a dict with the grid size, hole count, safe cells and difficulty label."""
        holes = sum(row.count('H') for row in self.grid)
        total_cells = self.size * self.size
        return {
            'size': self.size,
            'holes': holes,
            'safe_cells': total_cells - holes - 2,  # -2 for start and goal
            'difficulty': 'Fácil' if holes < 3 else 'Medio' if holes < 6 else 'Difícil'
        }


class QLearningAgent:
    """Tabular Q-Learning agent with configurable hyper-parameters."""

    def __init__(self, n_states, n_actions, learning_rate=0.1, discount_factor=0.95,
                 epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995):
        """Create the agent with a zero-initialized Q-table.

        Args:
            n_states: Number of rows in the Q-table.
            n_actions: Number of columns in the Q-table.
            learning_rate: Step size (alpha) of the Q update.
            discount_factor: Discount (gamma) applied to the next state's value.
            epsilon: Initial exploration probability.
            epsilon_min: Floor that decay never goes below.
            epsilon_decay: Multiplicative factor applied by ``decay_epsilon``.
        """
        self.n_states = n_states
        self.n_actions = n_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.initial_epsilon = epsilon

        # Q-table: one row per state, one column per action
        self.q_table = np.zeros((n_states, n_actions))

        # Training metrics
        self.training_scores = []
        self.training_episodes = []
        self.exploration_steps = 0
        self.exploitation_steps = 0

    def choose_action(self, state):
        """Pick an action with the epsilon-greedy strategy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value for ``state``. The
        exploration/exploitation counters are updated accordingly.
        """
        if random.random() < self.epsilon:
            self.exploration_steps += 1
            return random.randint(0, self.n_actions - 1)  # Exploration

        self.exploitation_steps += 1
        # Plain int so both branches return the same type
        return int(np.argmax(self.q_table[state]))  # Exploitation

    def update_q_table(self, state, action, reward, next_state):
        """Apply one Q-Learning update for the observed transition.

        Q(s,a) <- Q(s,a) + alpha * [r + gamma * max_a' Q(s',a') - Q(s,a)]
        """
        current_q = self.q_table[state, action]
        max_next_q = np.max(self.q_table[next_state])

        td_target = reward + self.discount_factor * max_next_q
        self.q_table[state, action] = current_q + self.learning_rate * (td_target - current_q)

    def decay_epsilon(self):
        """Shrink epsilon by ``epsilon_decay`` without dropping below ``epsilon_min``."""
        if self.epsilon > self.epsilon_min:
            # Clamp: a plain multiplication could overshoot below the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def get_policy(self):
        """Return the greedy action index for every state."""
        return np.argmax(self.q_table, axis=1)

    def reset_training(self):
        """Forget everything learned and restore the initial epsilon."""
        self.q_table = np.zeros((self.n_states, self.n_actions))
        self.epsilon = self.initial_epsilon
        self.training_scores = []
        # Cleared together with the scores so all metrics restart in sync
        self.training_episodes = []
        self.exploration_steps = 0
        self.exploitation_steps = 0

    def get_stats(self):
        """Return exploration rate, step count, epsilon and Q-value range."""
        total_steps = self.exploration_steps + self.exploitation_steps
        exploration_rate = self.exploration_steps / total_steps if total_steps > 0 else 0

        return {
            'exploration_rate': exploration_rate,
            'total_steps': total_steps,
            'epsilon': self.epsilon,
            'q_table_max': np.max(self.q_table),
            'q_table_min': np.min(self.q_table)
        }


def create_interactive_controls():
    """Build every widget used to configure and drive the experiment.

    Returns:
        dict mapping control names ('env_type', 'grid_size', 'hole_prob',
        'learning_rate', 'discount', 'epsilon', 'epsilon_decay', 'episodes',
        'test_episodes', 'show_progress', 'train_button', 'test_button',
        'reset_button') to their ipywidgets instances.
    """

    def wide_label():
        # Fresh dict per widget so no two widgets share a style object
        return {'description_width': 'initial'}

    def action_button(text, style_name):
        # All three action buttons share the same fixed size
        return widgets.Button(
            description=text,
            button_style=style_name,
            layout=widgets.Layout(width='200px', height='40px')
        )

    controls = {}

    # --- Environment settings ---
    controls['env_type'] = widgets.RadioButtons(
        options=[('Aleatorio', 'random'), ('Clásico', 'clasico'),
                 ('Fácil', 'facil'), ('Difícil', 'dificil'), ('Laberinto 5x5', 'laberinto')],
        value='clasico',
        description='Tipo de entorno:',
        style=wide_label()
    )
    controls['grid_size'] = widgets.IntSlider(
        value=4, min=3, max=8, step=1,
        description='Tamaño (solo aleatorio):',
        style=wide_label()
    )
    controls['hole_prob'] = widgets.FloatSlider(
        value=0.15, min=0.0, max=0.4, step=0.05,
        description='Prob. hoyos (aleatorio):',
        style=wide_label()
    )

    # --- Agent settings ---
    controls['learning_rate'] = widgets.FloatSlider(
        value=0.1, min=0.01, max=0.5, step=0.01,
        description='Tasa de aprendizaje (α):',
        style=wide_label()
    )
    controls['discount'] = widgets.FloatSlider(
        value=0.95, min=0.5, max=0.99, step=0.01,
        description='Factor descuento (γ):',
        style=wide_label()
    )
    controls['epsilon'] = widgets.FloatSlider(
        value=1.0, min=0.1, max=1.0, step=0.1,
        description='Epsilon inicial:',
        style=wide_label()
    )
    controls['epsilon_decay'] = widgets.FloatSlider(
        value=0.995, min=0.990, max=0.999, step=0.001,
        description='Decaimiento epsilon:',
        style=wide_label()
    )

    # --- Training settings ---
    controls['episodes'] = widgets.IntSlider(
        value=1000, min=100, max=5000, step=100,
        description='Episodios:',
        style=wide_label()
    )
    controls['test_episodes'] = widgets.IntSlider(
        value=3, min=1, max=10, step=1,
        description='Episodios de prueba:',
        style=wide_label()
    )
    controls['show_progress'] = widgets.Checkbox(
        value=True,
        description='Mostrar progreso'
    )

    # --- Action buttons ---
    controls['train_button'] = action_button('Entrenar Agente', 'success')
    controls['test_button'] = action_button('Probar Agente', 'info')
    controls['reset_button'] = action_button('Reiniciar', 'warning')

    return controls


def train_agent_interactive(env, agent, episodes=1000, show_progress=True):
    """Train the agent on the environment, optionally printing progress.

    Each episode runs until the environment reports it is done or 200 steps
    have been taken. After every episode the agent's epsilon is decayed.

    Args:
        env: Environment exposing ``reset()``, ``step(action)``, ``grid`` and
            ``current_pos``.
        agent: Agent exposing ``choose_action``, ``update_q_table``,
            ``decay_epsilon``, ``get_stats`` and its hyper-parameters.
        episodes: Number of training episodes.
        show_progress: When True, print a header, about ten progress lines
            and a final summary.

    Returns:
        List with the total reward of each episode.
    """
    scores = []
    success_count = 0

    # Report roughly ten times per run. The floor of 1 prevents the
    # modulo-by-zero that `episodes // 10` caused for runs under 10 episodes.
    report_every = max(1, episodes // 10)

    if show_progress:
        print("INICIANDO ENTRENAMIENTO...")
        print(f"Configuración: {episodes} episodios")
        print(f"Parámetros: α={agent.learning_rate}, γ={agent.discount_factor}, ε₀={agent.initial_epsilon}")
        print("=" * 60)

    for episode in range(episodes):
        state = env.reset()
        total_reward = 0
        steps = 0
        done = False

        while not done and steps < 200:
            action = agent.choose_action(state)
            next_state, reward, done = env.step(action)
            agent.update_q_table(state, action, reward, next_state)

            state = next_state
            total_reward += reward
            steps += 1

        scores.append(total_reward)
        agent.decay_epsilon()

        # An episode counts as a success when it ended on the goal cell
        if env.grid[env.current_pos[0]][env.current_pos[1]] == 'G':
            success_count += 1

        if show_progress and (episode + 1) % report_every == 0:
            recent_avg = np.mean(scores[-100:])
            success_rate = success_count / (episode + 1)
            print(f"Episodio {episode + 1:4d}/{episodes} | "
                  f"Puntuación: {recent_avg:6.2f} | "
                  f"Éxito: {success_rate:.1%} | "
                  f"ε: {agent.epsilon:.3f}")

    if show_progress:
        print("=" * 60)
        print("ENTRENAMIENTO COMPLETADO")

        # Final statistics; fall back to 0 when no episode was run so an
        # empty run reports cleanly instead of dividing by zero.
        final_success_rate = success_count / len(scores) if scores else 0.0
        final_avg_score = np.mean(scores[-100:]) if scores else 0.0
        stats = agent.get_stats()

        print(f"Puntuación promedio final: {final_avg_score:.2f}")
        print(f"Tasa de éxito: {final_success_rate:.1%}")
        print(f"Exploración vs Explotación: {stats['exploration_rate']:.1%} / {1-stats['exploration_rate']:.1%}")
        print(f"Rango valores Q: [{stats['q_table_min']:.2f}, {stats['q_table_max']:.2f}]")

    return scores


def _print_q_table_details(agent, env):
    """Print one line per state: index, grid position, cell type, Q-values and greedy action."""
    action_symbols = ['↑', '→', '↓', '←']

    print("\n" + "="*80)
    print("TABLA Q DETALLADA")
    print("="*80)
    print(f"{'Estado':<8} {'Pos':<8} {'Tipo':<6} {'↑':<8} {'→':<8} {'↓':<8} {'←':<8} {'Mejor':<8}")
    print("-" * 80)

    for state in range(agent.n_states):
        row, col = divmod(state, env.size)
        # Format the position first so it can be padded as one column; the
        # original wrote "<4" outside the braces and printed it literally.
        position = f"({row},{col})"
        cell_type = env.grid[row][col]
        values = agent.q_table[state]
        best_action = np.argmax(values)

        print(f"{state:<8} {position:<8} {cell_type:<6} "
              f"{values[0]:<8.2f} {values[1]:<8.2f} {values[2]:<8.2f} {values[3]:<8.2f} "
              f"{action_symbols[best_action]:<8}")


def visualize_complete_results(scores, agent, env):
    """Plot a 2x3 dashboard of the training run and print the full Q-table.

    Panels: (1) per-episode scores with a moving average, (2) score
    histogram, (3) Q-table heatmap, (4) the environment layout, (5) the
    greedy policy as arrows, (6) the maximum Q-value of each state.

    Args:
        scores: Total reward per training episode.
        agent: Trained agent providing ``q_table``, ``n_states`` and
            ``get_policy()``.
        env: Environment providing ``size`` and ``grid``.
    """
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # 1. Scores during training
    axes[0,0].plot(scores, alpha=0.6, color='blue', linewidth=0.5)
    if len(scores) >= 100:
        window = min(100, len(scores)//10)
        moving_avg = [np.mean(scores[i:i+window]) for i in range(len(scores)-window+1)]
        # Each average is plotted at the last episode of its window
        axes[0,0].plot(range(window-1, len(scores)), moving_avg, 'red', linewidth=2,
                      label=f'Promedio móvil ({window})')
        axes[0,0].legend()

    axes[0,0].set_title('Progreso del Entrenamiento')
    axes[0,0].set_xlabel('Episodio')
    axes[0,0].set_ylabel('Puntuación')
    axes[0,0].grid(True, alpha=0.3)

    # 2. Score histogram
    mean_score = np.mean(scores)
    axes[0,1].hist(scores, bins=30, alpha=0.7, color='green', edgecolor='black')
    axes[0,1].axvline(mean_score, color='red', linestyle='--',
                     label=f'Media: {mean_score:.1f}')
    axes[0,1].set_title('Distribución de Puntuaciones')
    axes[0,1].set_xlabel('Puntuación')
    axes[0,1].set_ylabel('Frecuencia')
    axes[0,1].legend()
    axes[0,1].grid(True, alpha=0.3)

    # 3. Q-table as a heatmap
    im = axes[0,2].imshow(agent.q_table, cmap='RdYlBu', aspect='auto')
    axes[0,2].set_title('Tabla Q (Estados × Acciones)')
    axes[0,2].set_xlabel('Acciones [↑ → ↓ ←]')
    axes[0,2].set_ylabel('Estados')
    plt.colorbar(im, ax=axes[0,2])

    # 4. Environment layout, encoded as numbers for the colormap
    cell_codes = {'H': -1, 'G': 1, 'S': 0.5}  # any other cell stays 0
    env_display = np.zeros((env.size, env.size))
    for i in range(env.size):
        for j in range(env.size):
            env_display[i][j] = cell_codes.get(env.grid[i][j], 0)

    axes[1,0].imshow(env_display, cmap='RdYlGn', vmin=-1, vmax=1)
    axes[1,0].set_title('Entorno')

    # Cell letters on top of the colors
    for i in range(env.size):
        for j in range(env.size):
            axes[1,0].text(j, i, env.grid[i][j], ha='center', va='center',
                          fontsize=12, fontweight='bold',
                          color='white' if env.grid[i][j] == 'H' else 'black')
    axes[1,0].set_xticks([])
    axes[1,0].set_yticks([])

    # 5. Learned policy
    policy = agent.get_policy()
    policy_grid = policy.reshape(env.size, env.size)

    axes[1,1].imshow(policy_grid, cmap='tab10', vmin=0, vmax=3)
    axes[1,1].set_title('Política Óptima')

    action_symbols = ['↑', '→', '↓', '←']
    for i in range(env.size):
        for j in range(env.size):
            if env.grid[i][j] != 'H':
                axes[1,1].text(j, i, action_symbols[policy_grid[i, j]], ha='center', va='center',
                              fontsize=14, fontweight='bold', color='white')
            else:
                axes[1,1].text(j, i, '💀', ha='center', va='center', fontsize=12)

    axes[1,1].set_xticks([])
    axes[1,1].set_yticks([])

    # 6. Maximum Q-value per state
    max_q_values = np.max(agent.q_table, axis=1).reshape(env.size, env.size)
    im2 = axes[1,2].imshow(max_q_values, cmap='viridis')
    axes[1,2].set_title('Valores Q Máximos por Estado')

    # Values as text
    for i in range(env.size):
        for j in range(env.size):
            max_q = max_q_values[i, j]
            axes[1,2].text(j, i, f'{max_q:.1f}', ha='center', va='center',
                          fontsize=10, color='white' if max_q < 0 else 'black')

    axes[1,2].set_xticks([])
    axes[1,2].set_yticks([])
    plt.colorbar(im2, ax=axes[1,2])

    plt.tight_layout()
    plt.show()

    # Detailed Q-table
    _print_q_table_details(agent, env)


def test_agent_interactive(env, agent, episodes=3):
    """Run the agent greedily for a few episodes and print a step-by-step report.

    Epsilon is forced to 0 during the evaluation and restored afterwards,
    even if an episode raises. Each episode stops when the environment
    reports it is done or after 100 steps.

    Args:
        env: Environment exposing ``reset()``, ``step()``, ``render()``,
            ``grid``, ``current_pos`` and ``action_names``.
        agent: Agent exposing ``epsilon`` and ``choose_action``.
        episodes: Number of evaluation episodes.

    Returns:
        List of dicts, one per episode, with keys 'episode', 'success',
        'steps', 'reward' and 'path_length'.
    """
    # Real newlines below: the original "\\n" printed a literal backslash-n.
    print("\n" + "🧪" + "="*60)
    print("EVALUACIÓN DEL AGENTE ENTRENADO")
    print("="*62)

    original_epsilon = agent.epsilon
    agent.epsilon = 0  # Pure exploitation mode

    results = []

    try:
        for episode in range(episodes):
            state = env.reset()
            total_reward = 0
            steps = 0
            path = [env.current_pos]
            actions_taken = []

            print(f"\n EPISODIO {episode + 1}")
            print("-" * 30)

            # Initial state
            env.render()

            while not env.done and steps < 100:
                action = agent.choose_action(state)
                actions_taken.append(env.action_names[action])

                next_state, reward, done = env.step(action)
                total_reward += reward
                steps += 1
                path.append(env.current_pos)

                if steps <= 10:  # Show the first steps in detail
                    print(f"Paso {steps}: {env.action_names[action]} → Pos{env.current_pos} → R={reward}")
                elif done:
                    print(f"... [pasos {steps-10}-{steps}] ...")
                    print(f"Paso {steps}: {env.action_names[action]} → Pos{env.current_pos} → R={reward}")

                state = next_state

            # Final state
            env.render()

            success = env.grid[env.current_pos[0]][env.current_pos[1]] == 'G'
            results.append({
                'episode': episode + 1,
                'success': success,
                'steps': steps,
                'reward': total_reward,
                'path_length': len(path)
            })

            status = "✅ ÉXITO" if success else "❌ FALLO"
            print(f"\n{status} | Pasos: {steps} | Recompensa: {total_reward}")
            print(f"Camino: {' → '.join(map(str, path[:10]))}" +
                  (f" → ... → {path[-1]}" if len(path) > 10 else ""))
            print(f"Acciones: {' '.join(actions_taken[:10])}" +
                  (" ..." if len(actions_taken) > 10 else ""))
    finally:
        # Always hand the agent back with its training epsilon, so a failed
        # evaluation cannot leave it stuck in greedy mode.
        agent.epsilon = original_epsilon

    # Final summary; zero fallbacks keep an empty run from dividing by zero
    successes = sum(r['success'] for r in results)
    success_rate = successes / len(results) if results else 0.0
    avg_steps = np.mean([r['steps'] for r in results]) if results else 0.0
    avg_reward = np.mean([r['reward'] for r in results]) if results else 0.0

    print("\n" + "📊" + "="*40)
    print("RESUMEN DE EVALUACIÓN")
    print("="*42)
    print(f"Tasa de éxito: {successes}/{episodes} ({success_rate:.1%})")
    print(f"Pasos promedio: {avg_steps:.1f}")
    print(f"Recompensa promedio: {avg_reward:.1f}")

    if successes > 0:
        successful_results = [r for r in results if r['success']]
        optimal_steps = np.mean([r['steps'] for r in successful_results])
        print(f"Pasos promedio (éxitos): {optimal_steps:.1f}")

    return results


def create_experiment_interface():
    """Build and display the full interactive experiment UI.

    Creates the controls, wires the train/test/reset buttons to their
    handlers and shows the tabbed control panel.

    Returns:
        Tuple ``(controls, experiment_state)``: the widget dict and the
        mutable dict holding the current env, agent, trained flag and scores.
    """

    print("LABORATORIO DE APRENDIZAJE POR REFUERZO")
    print("=" * 60)
    print("Configura los parámetros y entrena tu agente Q-Learning")
    print("=" * 60)

    controls = create_interactive_controls()

    # Shared mutable state for the button callbacks (instead of globals)
    experiment_state = {
        'env': None,
        'agent': None,
        'trained': False,
        'scores': None
    }

    def sync_test_button():
        """Enable the test button only while a trained agent exists."""
        button = controls['test_button']
        if experiment_state['trained']:
            button.disabled = False
            button.button_style = 'info'
            button.description = 'Probar Agente ✓'
        else:
            button.disabled = True
            button.button_style = 'warning'
            button.description = 'Probar Agente (Entrena primero)'

    def on_train_clicked(b):
        """Create the environment and agent from the controls, then train and plot."""
        clear_output(wait=True)

        try:
            # Create environment
            if controls['env_type'].value == 'random':
                experiment_state['env'] = FrozenLakeEnvironment(
                    size=controls['grid_size'].value,
                    hole_probability=controls['hole_prob'].value
                )
            else:
                experiment_state['env'] = FrozenLakeEnvironment.from_pattern(controls['env_type'].value)

            env = experiment_state['env']

            # Show environment information
            info = env.get_grid_info()
            print("ENTORNO CREADO")
            print(f"Tamaño: {info['size']}×{info['size']} | Hoyos: {info['holes']} | Dificultad: {info['difficulty']}")
            env.render()

            # Create agent
            n_states = env.size * env.size
            n_actions = len(env.actions)

            experiment_state['agent'] = QLearningAgent(
                n_states=n_states,
                n_actions=n_actions,
                learning_rate=controls['learning_rate'].value,
                discount_factor=controls['discount'].value,
                epsilon=controls['epsilon'].value,
                epsilon_decay=controls['epsilon_decay'].value
            )

            print("AGENTE CREADO")
            print(f"Estados: {n_states} | Acciones: {n_actions}")

            # Train (real newlines: the original "\\n" printed a literal backslash-n)
            print("\nINICIANDO ENTRENAMIENTO...")
            experiment_state['scores'] = train_agent_interactive(
                env, experiment_state['agent'],
                episodes=controls['episodes'].value,
                show_progress=controls['show_progress'].value
            )

            experiment_state['trained'] = True

            print("\nENTRENAMIENTO COMPLETADO")
            print("🧪 Ahora puedes usar el botón 'Probar Agente' para evaluar el rendimiento")

            # Visualize
            visualize_complete_results(experiment_state['scores'], experiment_state['agent'], env)

        except Exception as e:
            # UI boundary: report the problem instead of killing the callback
            print(f"Error durante el entrenamiento: {str(e)}")
            experiment_state['trained'] = False

        # Show the controls again (this also refreshes the test button)
        display_controls()

    def on_test_clicked(b):
        """Evaluate the trained agent, or ask the user to train one first."""
        if not experiment_state['trained'] or experiment_state['env'] is None or experiment_state['agent'] is None:
            print("Primero entrena un agente!")
            print("Usa el botón 'Entrenar Agente' para crear y entrenar un agente.")
            return

        clear_output(wait=True)
        print("🧪 EVALUANDO AGENTE ENTRENADO...")
        print("="*50)

        try:
            test_agent_interactive(
                experiment_state['env'],
                experiment_state['agent'],
                controls['test_episodes'].value
            )

            print("\nEVALUACIÓN COMPLETADA")

        except Exception as e:
            print(f"Error durante las pruebas: {str(e)}")

        display_controls()

    def on_reset_clicked(b):
        """Drop the current env/agent/scores and restore the initial UI."""
        clear_output(wait=True)

        # Reset state
        experiment_state['env'] = None
        experiment_state['agent'] = None
        experiment_state['trained'] = False
        experiment_state['scores'] = None

        # Reset buttons
        sync_test_button()
        controls['train_button'].button_style = 'success'
        controls['train_button'].description = 'Entrenar Agente'

        print("Sistema reiniciado completamente")
        print("Configura los parámetros y entrena un nuevo agente")
        display_controls()

    def display_controls():
        """Print the status line and show the tabbed control panel."""
        print("\nPANEL DE CONTROL")
        print("-" * 30)

        # Current status
        if experiment_state['trained']:
            print("Estado: Agente entrenado y listo para pruebas")
        else:
            print("Estado: Configura parámetros y entrena un agente")

        print()

        # Group the controls into tabs
        env_tab = widgets.VBox([
            widgets.HTML("<b> Configuración del Entorno</b>"),
            controls['env_type'],
            widgets.HTML("<i>Solo para entorno 'Aleatorio':</i>"),
            controls['grid_size'],
            controls['hole_prob']
        ])

        agent_tab = widgets.VBox([
            widgets.HTML("<b> Parámetros del Agente</b>"),
            widgets.HTML("<i>α (alpha): Velocidad de aprendizaje</i>"),
            controls['learning_rate'],
            widgets.HTML("<i>γ (gamma): Importancia del futuro</i>"),
            controls['discount'],
            widgets.HTML("<i>ε (epsilon): Exploración inicial</i>"),
            controls['epsilon'],
            widgets.HTML("<i>Decaimiento de ε</i>"),
            controls['epsilon_decay']
        ])

        train_tab = widgets.VBox([
            widgets.HTML("<b> Configuración de Entrenamiento</b>"),
            controls['episodes'],
            controls['test_episodes'],
            controls['show_progress']
        ])

        # Button appearance follows the trained/untrained state
        sync_test_button()

        button_tab = widgets.VBox([
            widgets.HTML("<b>🎮 Acciones</b>"),
            widgets.HBox([
                controls['train_button'],
                controls['test_button'],
                controls['reset_button']
            ], layout=widgets.Layout(justify_content='center')),
            widgets.HTML("""
            <div style='margin-top: 10px; padding: 10px; background-color: #f0f0f0; border-radius: 5px;'>
                <b>💡 Instrucciones:</b><br>
                1. Configura los parámetros en las pestañas<br>
                2. Haz clic en 'Entrenar Agente'<br>
                3. Una vez entrenado, usa 'Probar Agente'<br>
                4. Usa 'Reiniciar' para comenzar de nuevo
            </div>
            """)
        ])

        # Build the tabs
        tabs = widgets.Tab()
        tabs.children = [env_tab, agent_tab, train_tab, button_tab]
        # set_title() is available in both ipywidgets 7 and 8, whereas
        # assigning `tabs.titles` only takes effect on ipywidgets 8.
        for index, title in enumerate(['🗺️ Entorno', 'Agente', 'Entrenamiento', 'Acciones']):
            tabs.set_title(index, title)

        display(tabs)

        # Extra information
        if experiment_state['env'] is not None:
            info = experiment_state['env'].get_grid_info()
            print(f"\nEntorno actual: {info['size']}×{info['size']} | {info['holes']} hoyos | {info['difficulty']}")

        if experiment_state['trained'] and experiment_state['agent'] is not None:
            stats = experiment_state['agent'].get_stats()
            print(f"Agente: ε={stats['epsilon']:.3f} | Pasos totales: {stats['total_steps']}")
            if experiment_state['scores']:
                recent_score = np.mean(experiment_state['scores'][-100:])
                print(f"Puntuación promedio reciente: {recent_score:.2f}")

    # Initial button state (nothing trained yet)
    sync_test_button()

    # Wire events
    controls['train_button'].on_click(on_train_clicked)
    controls['test_button'].on_click(on_test_clicked)
    controls['reset_button'].on_click(on_reset_clicked)

    # Show the initial interface
    display_controls()

    return controls, experiment_state


def quick_experiment(env_type='clasico', episodes=500, learning_rate=0.1, show_details=True):
    """Run one train (and optionally visualize + test) cycle without the widget UI.

    Args:
        env_type: 'random' for a random 4x4 grid, otherwise a pattern name
            accepted by ``FrozenLakeEnvironment.from_pattern``.
        episodes: Number of training episodes.
        learning_rate: Learning rate passed to the agent.
        show_details: When True, print the environment, show training
            progress, plot the results and run 3 test episodes.

    Returns:
        Tuple ``(env, agent, scores)``.
    """
    print(f"EXPERIMENTO RÁPIDO: {env_type.upper()}")
    print("="*50)

    # Environment: random grid or one of the predefined layouts
    env = (
        FrozenLakeEnvironment(size=4, hole_probability=0.15)
        if env_type == 'random'
        else FrozenLakeEnvironment.from_pattern(env_type)
    )

    if show_details:
        grid_info = env.get_grid_info()
        side = grid_info['size']
        print(f"Entorno: {side}×{side} | Hoyos: {grid_info['holes']} | Dificultad: {grid_info['difficulty']}")
        env.render()

    # Agent sized to the environment
    agent = QLearningAgent(
        n_states=env.size * env.size,
        n_actions=len(env.actions),
        learning_rate=learning_rate,
        discount_factor=0.95,
        epsilon=1.0,
        epsilon_decay=0.995
    )

    scores = train_agent_interactive(env, agent, episodes=episodes, show_progress=show_details)

    # Quiet mode stops here; detailed mode also plots and evaluates
    if not show_details:
        return env, agent, scores

    visualize_complete_results(scores, agent, env)
    test_agent_interactive(env, agent, episodes=3)

    return env, agent, scores


def compare_experiments():
    """Train several configurations on the 'clasico' grid and compare them.

    Each configuration is trained quietly, then evaluated greedily
    (epsilon = 0) for a fixed number of episodes capped at 100 steps each.
    A table and three bar charts (success rate, average steps, final score)
    are shown.

    Returns:
        List of dicts with keys 'name', 'success_rate', 'avg_steps',
        'final_score', 'learning_rate' and 'episodes'.
    """

    print("COMPARACIÓN DE EXPERIMENTOS")
    print("="*50)

    configs = [
        {'name': 'Conservador', 'lr': 0.05, 'episodes': 500},
        {'name': 'Estándar', 'lr': 0.1, 'episodes': 500},
        {'name': 'Agresivo', 'lr': 0.3, 'episodes': 500},
        {'name': 'Más Entrenamiento', 'lr': 0.1, 'episodes': 1500}
    ]

    eval_episodes = 20  # greedy test episodes per configuration
    results = []

    for config in configs:
        print(f"\nProbando configuración: {config['name']}")
        env, agent, scores = quick_experiment(
            env_type='clasico',
            episodes=config['episodes'],
            learning_rate=config['lr'],
            show_details=False
        )

        # Evaluate the agent in pure exploitation mode
        agent.epsilon = 0
        success_count = 0
        total_steps = 0

        for _ in range(eval_episodes):
            state = env.reset()
            steps = 0
            while not env.done and steps < 100:
                state, _reward, _done = env.step(agent.choose_action(state))
                steps += 1

            if env.grid[env.current_pos[0]][env.current_pos[1]] == 'G':
                success_count += 1
            total_steps += steps

        results.append({
            'name': config['name'],
            'success_rate': success_count / eval_episodes,
            'avg_steps': total_steps / eval_episodes,
            'final_score': np.mean(scores[-100:]),
            'learning_rate': config['lr'],
            'episodes': config['episodes']
        })

    # Comparison table. The name column is 18 wide so the longest name fits,
    # and the percentage is formatted first and then padded, so every row
    # lines up with the header regardless of the number of digits.
    print("\nRESULTADOS COMPARATIVOS")
    print("="*80)
    print(f"{'Configuración':<18} {'Tasa Éxito':<12} {'Pasos Prom':<12} {'Score Final':<12} {'LR':<6} {'Episodios'}")
    print("-"*80)

    for r in results:
        success_cell = f"{r['success_rate']:.1%}"
        print(f"{r['name']:<18} {success_cell:<12} {r['avg_steps']:<12.1f} "
              f"{r['final_score']:<12.1f} {r['learning_rate']:<6.2f} {r['episodes']}")

    # Comparison charts
    plt.figure(figsize=(12, 4))
    names = [r['name'] for r in results]

    plt.subplot(1, 3, 1)
    success_rates = [r['success_rate'] for r in results]
    plt.bar(names, success_rates, color='green', alpha=0.7)
    plt.title('Tasa de Éxito')
    plt.ylabel('Proporción')
    plt.xticks(rotation=45)

    plt.subplot(1, 3, 2)
    avg_steps = [r['avg_steps'] for r in results]
    plt.bar(names, avg_steps, color='blue', alpha=0.7)
    plt.title('Pasos Promedio')
    plt.ylabel('Pasos')
    plt.xticks(rotation=45)

    plt.subplot(1, 3, 3)
    final_scores = [r['final_score'] for r in results]
    plt.bar(names, final_scores, color='red', alpha=0.7)
    plt.title('Puntuación Final')
    plt.ylabel('Score')
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

    return results


# Función principal mejorada
def main():
    """Print the option menu and return the available entry points.

    Returns:
        dict mapping 'interactive', 'quick', 'compare' and 'simple' to the
        callables that run each mode.
    """
    menu_lines = (
        "LABORATORIO DE Q-LEARNING",
        "="*40,
        "Selecciona una opción:",
        "1. Interfaz interactiva completa",
        "2. Experimento rápido",
        "3. Comparación de configuraciones",
        "4. Ejemplo simple",
    )
    for line in menu_lines:
        print(line)

    def simple_example():
        # Short detailed run on the classic grid
        return quick_experiment('clasico', 300, 0.1, True)

    options = {}
    options['interactive'] = create_experiment_interface
    options['quick'] = quick_experiment
    options['compare'] = compare_experiments
    options['simple'] = simple_example
    return options


# Para usar en Colab directamente:
def run_interactive():
    """Launch the widget-based interface and return what it returns."""
    interface = create_experiment_interface()
    return interface

def run_quick(env_type='clasico'):
    """Run a quick experiment on the given environment type with default settings."""
    outcome = quick_experiment(env_type=env_type)
    return outcome

def run_comparison():
    """Run the configuration comparison and return its results."""
    comparison = compare_experiments()
    return comparison


# Función de inicio automático
def auto_start():
    """Print a banner and start the interactive interface.

    If creating the interface raises ImportError, a detailed quick
    experiment on the classic grid is run instead.

    Returns:
        Whatever the interface (or the fallback experiment) returns.
    """
    divider = "="*60
    print("LABORATORIO DE Q-LEARNING - INICIO AUTOMÁTICO")
    print(divider)
    print("Iniciando interfaz interactiva...")
    print("Puedes modificar los parámetros en los controles que aparecen abajo")
    print(divider)

    try:
        # Preferred path: the widget interface
        interface = create_experiment_interface()
    except ImportError:
        print("Widgets no disponibles, ejecutando experimento simple...")
        return quick_experiment('clasico', 500, 0.1, True)
    return interface


# Instrucciones adicionales
def show_instructions():
    """Print the usage guide: main functions, environment types and an example call."""
    instructions = """
 INSTRUCCIONES ADICIONALES:

 FUNCIONES PRINCIPALES:
   • run_interactive() - Interfaz completa con controles
   • run_quick('clasico') - Experimento rápido
   • run_comparison() - Comparar configuraciones

  TIPOS DE ENTORNO:
   • 'clasico' - Grid 4x4 tradicional
   • 'facil' - Pocos obstáculos
   • 'dificil' - Muchos obstáculos
   • 'laberinto' - Grid 5x5 complejo

 EJEMPLO PERSONALIZADO:
   env, agent, scores = quick_experiment(
       env_type='dificil',
       episodes=1000,
       learning_rate=0.2
   )
"""
    print(instructions)


# Entry point. When loaded as a module (not run as the main script), the
# usage instructions and a notice are printed first; in both cases the
# automatic interactive start then runs.
if __name__ != "__main__":
    show_instructions()
    print("\nEJECUTANDO EXPERIMENTO AUTOMÁTICO...")
auto_start()

Sistema reiniciado completamente
Configura los parámetros y entrena un nuevo agente
\nPANEL DE CONTROL
------------------------------
Estado: Configura parámetros y entrena un agente



Tab(children=(VBox(children=(HTML(value='<b> Configuración del Entorno</b>'), RadioButtons(description='Tipo d…