### 1. Lógica de NN para el 2 armed bandit

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

# ======= 1. ENTORNO: SwapDoorBandit =======
class SwapDoorBandit:
    """Two-door bandit whose pair of doors is swapped once during an episode.

    Doors are grouped in two fixed pairs, ('A', 'B') and ('C', 'D'). Every
    door has a win probability and a reward magnitude: pulling it pays
    ``+magnitude`` with that probability and ``-magnitude`` otherwise.
    One pair is picked at random on construction; the environment switches
    to the other pair at the beginning of the pull whose zero-based index
    equals ``swap_step``.

    Args:
        episode_length: Number of pulls in an episode. Only used to draw a
            random ``swap_step`` when none is given.
        swap_step: Zero-based pull index at which the pair is swapped. When
            ``None``, it is drawn uniformly from ``1 .. episode_length - 1``.
            If that range is empty (``episode_length <= 1``) ``swap_step``
            stays ``None`` and no swap ever happens.
    """

    # Door label -> (win probability, reward magnitude). Single source of
    # truth shared by the constructor and the swap logic.
    DOORS = {
        'A': (0.75, 10),
        'B': (0.25, 10),
        'C': (0.75,  3),
        'D': (0.25,  3),
    }

    def __init__(self, episode_length, swap_step=None):
        # Available pairs, each as (left door, right door).
        self.pairs = [('A', 'B'), ('C', 'D')]
        # Pick the initial pair at random.
        self.current = 0 if np.random.rand() < 0.5 else 1
        self._load_current_pair()

        self.episode_length = episode_length
        if swap_step is not None:
            self.swap_step = swap_step
        elif episode_length > 1:
            self.swap_step = np.random.randint(1, episode_length)
        else:
            # randint(1, episode_length) would raise on an empty range;
            # an episode this short simply has no swap.
            self.swap_step = None
        self.step_count = 0

    def _load_current_pair(self):
        """Refresh labels, probabilities and magnitudes from ``self.current``."""
        self.left_label, self.right_label = self.pairs[self.current]
        self.left_p, self.left_r = self.DOORS[self.left_label]
        self.right_p, self.right_r = self.DOORS[self.right_label]

    def pull(self, action):
        """Pull a door and return the signed reward.

        Args:
            action: ``0`` selects the left door; any other value selects the
                right door.

        Returns:
            ``+magnitude`` or ``-magnitude`` of the selected door.
        """
        # The swap is applied before the reward is drawn, so the pull at
        # index ``swap_step`` is already served by the new pair.
        if self.step_count == self.swap_step:
            self.current = 1 - self.current
            self._load_current_pair()

        if action == 0:
            p, r = self.left_p, self.left_r
        else:
            p, r = self.right_p, self.right_r

        reward = r if np.random.rand() < p else -r
        self.step_count += 1
        return reward

### 2. Entrenamiento con PPO

In [None]:
# ======= 2. MODELO: Agente PPO con LSTM =======
class PPOAgent(nn.Module):
    """Recurrent actor-critic: one LSTM cell feeding a policy and a value head.

    The recurrent state (``hx``, ``cx``) is kept on the module between calls
    to ``forward`` and has a fixed batch size of 1. Call ``reset_state`` at
    the start of every episode (and before replaying an episode) so that the
    memory starts from zeros.

    Args:
        input_size: Width of the observation vector fed to the LSTM cell.
        hidden_size: Width of the LSTM hidden and cell states.
        num_actions: Number of logits produced by the policy head.
    """

    def __init__(self, input_size=4, hidden_size=32, num_actions=2):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTMCell(input_size, hidden_size)
        self.policy_head = nn.Linear(hidden_size, num_actions)
        self.value_head = nn.Linear(hidden_size, 1)
        # Make sure hx/cx exist so forward() works on a freshly built agent.
        self.reset_state()

    def reset_state(self):
        """Zero the recurrent state, matching the LSTM weights' device and dtype."""
        ref = self.lstm.weight_ih
        self.hx = torch.zeros(1, self.hidden_size, device=ref.device, dtype=ref.dtype)
        self.cx = torch.zeros(1, self.hidden_size, device=ref.device, dtype=ref.dtype)

    def forward(self, x):
        """Advance the LSTM one step and return ``(policy_logits, value)``.

        Args:
            x: Observation of shape ``(1, input_size)`` (batch size 1, to
                match the stored recurrent state).

        Returns:
            Tuple of policy logits ``(1, num_actions)`` and value ``(1, 1)``.
        """
        self.hx, self.cx = self.lstm(x, (self.hx, self.cx))
        return self.policy_head(self.hx), self.value_head(self.hx)

# ======= 3. Crear la entrada del agente =======
def get_input(last_action, last_reward, timestep, num_actions=2, time_scale=10.0):
    """Build the agent's observation for one step.

    The observation concatenates the one-hot encoding of the previous
    action, the previous reward, and the scaled timestep.

    Args:
        last_action: Index of the previous action, in ``[0, num_actions)``.
        last_reward: Reward received for the previous action.
        timestep: Index of the current step within the episode.
        num_actions: Size of the one-hot action encoding.
        time_scale: Divisor applied to ``timestep``. The default of 10.0
            keeps the previously hard-coded scaling.

    Returns:
        A float32 tensor of shape ``(1, num_actions + 2)``.
    """
    action_onehot = F.one_hot(
        torch.tensor([last_action]), num_classes=num_actions
    ).float()
    scalars = torch.tensor(
        [[last_reward, timestep / time_scale]], dtype=torch.float32
    )
    return torch.cat([action_onehot, scalars], dim=1)

# ======= 4. HIPERPARÁMETROS DE PPO =======
# Rollout schedule: number of training episodes and pulls per episode.
num_episodes, episode_length = 2000, 10

# PPO optimisation settings.
gamma = 0.99         # discount applied when accumulating returns
clip_epsilon = 0.2   # half-width of the clipping interval around ratio 1
ppo_epochs = 4       # optimisation passes over each collected episode
lr = 1e-3            # Adam learning rate

# Policy/value network and its optimiser.
agent = PPOAgent()
optimizer = optim.Adam(agent.parameters(), lr=lr)

# ======= 5. ENTRENAMIENTO CON PPO =======
for episode in range(num_episodes):
    env = SwapDoorBandit(episode_length=episode_length)
    agent.reset_state()

    states, actions, rewards, logps, values = [], [], [], [], []
    last_action, last_reward = 0, 0.0

    # Collect one episode. No gradients are needed here: every quantity used
    # in the loss is recomputed from `states` during the PPO update below.
    with torch.no_grad():
        for t in range(episode_length):
            x = get_input(last_action, last_reward, t)
            logits, v = agent(x)
            # Passing logits directly avoids a separate softmax round-trip.
            dist = torch.distributions.Categorical(logits=logits)
            action = dist.sample()

            states.append(x)
            actions.append(action)
            logps.append(dist.log_prob(action))
            values.append(v)

            reward = env.pull(action.item())
            rewards.append(reward)

            last_action, last_reward = action.item(), reward

    # Discounted returns, accumulated backwards and then put in time order.
    returns, G = [], 0.0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.append(G)
    returns.reverse()
    returns = torch.tensor(returns, dtype=torch.float32).unsqueeze(1)  # (T, 1)
    values = torch.cat(values)                                         # (T, 1)
    # Flatten to (T,) so the advantages line up element-wise with the (T,)
    # probability ratio; leaving them as (T, 1) would broadcast the product
    # to a (T, T) matrix and mix every step's ratio with every advantage.
    advantages = (returns - values.detach()).squeeze(1)
    old_logps = torch.cat(logps).detach()                              # (T,)

    # PPO update: replay the episode from a fresh recurrent state.
    for _ in range(ppo_epochs):
        new_logps, new_vals, entropies = [], [], []
        agent.reset_state()
        for x, taken_action in zip(states, actions):
            logits, v = agent(x)
            dist = torch.distributions.Categorical(logits=logits)
            new_logps.append(dist.log_prob(taken_action))
            new_vals.append(v)
            entropies.append(dist.entropy())
        new_logps = torch.cat(new_logps)             # (T,)
        new_vals = torch.cat(new_vals)               # (T, 1)
        entropy_term = torch.cat(entropies).mean()

        ratio = torch.exp(new_logps - old_logps)     # (T,)
        s1 = ratio * advantages
        s2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantages
        policy_loss = -torch.min(s1, s2).mean()
        value_loss = F.mse_loss(new_vals, returns)
        loss = policy_loss + 0.5 * value_loss - 0.01 * entropy_term

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (episode + 1) % 200 == 0:
        print(f"Episode {episode+1}, Total Reward: {sum(rewards)}, Loss: {loss.item():.2f}")

print("Entrenamiento completado.")

Episode 100, Total Reward: 140, Loss: 1698.6903
Episode 200, Total Reward: 60, Loss: 216.9727
Episode 300, Total Reward: 42, Loss: 121.1637
Episode 400, Total Reward: 42, Loss: 114.0210
Episode 500, Total Reward: 36, Loss: 136.1891
Episode 600, Total Reward: 60, Loss: 271.1169
Episode 700, Total Reward: 24, Loss: 267.9634
Episode 800, Total Reward: 120, Loss: 888.8828
Episode 900, Total Reward: 100, Loss: 539.2075
Episode 1000, Total Reward: 140, Loss: 1436.2120
Entrenamiento completado.


### 3. Ejemplo de funcionamiento con un agente en modo evaluación

In [None]:
# ======= 6. EVALUACIÓN =======
agent.eval()
env = SwapDoorBandit(episode_length=episode_length)
agent.reset_state()

# No pull has happened yet, so env.current still indexes the INITIAL pair;
# the other pair is the one the environment switches to at swap_step.
initial_pair = env.pairs[env.current]
later_pair = env.pairs[1 - env.current]
print(f"Swap en paso {env.swap_step}, pareja inicial {initial_pair}, luego {later_pair}")
last_action, last_reward = 0, 0
total_reward = 0

for t in range(episode_length):
    with torch.no_grad():
        x = get_input(last_action, last_reward, t)
        logits, _ = agent(x)
        action_probs = F.softmax(logits, dim=1)[0]
        # Sample from the policy (stochastic evaluation, not argmax).
        action = torch.multinomial(action_probs, 1).item()
        probs = action_probs.tolist()

    reward = env.pull(action)
    total_reward += reward
    print(f"Paso {t} | probs={probs} | Acción={action} | Recompensa={reward}")
    last_action, last_reward = action, reward

print("Recompensa total:", total_reward)

Familia y probs ocultas: [0.75, 0.25] [3, -3]
Paso 0 | Acción: 1 | Recompensa: 3
Paso 1 | Acción: 1 | Recompensa: 3
Paso 2 | Acción: 1 | Recompensa: 3
Paso 3 | Acción: 1 | Recompensa: 3
Paso 4 | Acción: 1 | Recompensa: 3
Paso 5 | Acción: 1 | Recompensa: -3
Paso 6 | Acción: 1 | Recompensa: 3
Paso 7 | Acción: 1 | Recompensa: 3
Paso 8 | Acción: 1 | Recompensa: 3
Paso 9 | Acción: 1 | Recompensa: 3
Paso 10 | Acción: 1 | Recompensa: -3
Paso 11 | Acción: 1 | Recompensa: 3
Paso 12 | Acción: 1 | Recompensa: 3
Paso 13 | Acción: 1 | Recompensa: 3
Paso 14 | Acción: 1 | Recompensa: 3
Paso 15 | Acción: 1 | Recompensa: 3
Paso 16 | Acción: 1 | Recompensa: -3
Paso 17 | Acción: 1 | Recompensa: 3
Paso 18 | Acción: 1 | Recompensa: -3
Paso 19 | Acción: 1 | Recompensa: 3
Recompensa total: 36
