<a href="https://colab.research.google.com/github/NNehmer/stc-alberta/blob/main/STC_Alberta_Agent_V1_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip -q install --upgrade torch

In [None]:
import sys

import torch

# Report the library and interpreter versions this notebook is running on.
python_version = sys.version.split()[0]
print("Torch:", torch.__version__, "Python:", python_version)

Torch: 2.8.0+cu126 Python: 3.12.11


In [None]:
# Optional (Colab): %pip -q install --upgrade torch

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from collections import deque
import random
from typing import Dict, Tuple, List

# ---------------------------
# Reproduzierbarkeit
# ---------------------------
def set_seed(seed: int = 42):
    """Seed all random number generators used in this notebook.

    Seeds, in this order, Python's ``random`` module, NumPy's global
    generator, PyTorch via ``torch.manual_seed`` and PyTorch's CUDA
    generators via ``torch.cuda.manual_seed_all``.

    Args:
        seed: Integer seed passed unchanged to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)


# Fix the seeds once at import time so a fresh run of the notebook is repeatable.
set_seed(42)

# ==========================================
# 1) STC-Bausteine
# ==========================================
class SpectralProjector(nn.Module):
    """Learnable orthogonal projector Π_S onto an "intent" subspace.

    The module stores an unconstrained ``[latent_dim, intent_rank]`` basis
    parameter.  On every forward pass the basis is orthonormalised with a QR
    decomposition and turned into the projection matrix ``Q Qᵀ``.
    """

    def __init__(self, latent_dim: int, intent_rank: int):
        """Create the projector.

        Args:
            latent_dim: Size D of the latent vectors being projected.
            intent_rank: Number r of basis columns spanning the subspace.
        """
        super().__init__()
        self.latent_dim = latent_dim
        self.intent_rank = intent_rank
        # Small random initial basis; it is orthonormalised in forward().
        initial_basis = 0.1 * torch.randn(latent_dim, intent_rank)
        self.basis = nn.Parameter(initial_basis)

    def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Project ``z`` onto the subspace spanned by the basis.

        Args:
            z: Latent batch whose last dimension is ``latent_dim``.

        Returns:
            ``(z_S, Pi_S)``: the projected latents ``z @ Pi_S`` and the
            ``[latent_dim, latent_dim]`` projection matrix itself.
        """
        orthonormal = torch.linalg.qr(self.basis)[0]   # [D, r] Q factor
        projection = orthonormal @ orthonormal.T        # [D, D]
        return z @ projection, projection

def coherence(z: torch.Tensor, z_S: torch.Tensor) -> torch.Tensor:
    """Return the coherence κ(ψ) = ||Π_S ψ||² / ||ψ||² along the last dimension.

    Args:
        z: Unprojected latents ψ.
        z_S: Projected latents Π_S ψ, same shape as ``z``.

    Returns:
        Tensor shaped like the inputs with the last dimension reduced to
        size 1 (``keepdim=True``).
    """
    # The epsilon is added to the norm (not the squared norm) to avoid 0/0.
    full_norm = z.norm(dim=-1, keepdim=True) + 1e-8
    projected_norm = z_S.norm(dim=-1, keepdim=True)
    ratio = projected_norm / full_norm
    return ratio.pow(2)

class ValueOperator(nn.Module):
    """Symmetric value operator V with quadratic value v(ψ) = ⟨ψ|V|ψ⟩.

    The learnable matrix ``W`` is unconstrained; the operator actually used
    is its symmetric part ``(W + Wᵀ) / 2``.
    """

    def __init__(self, latent_dim: int):
        """Create the operator with a small random ``latent_dim × latent_dim`` matrix."""
        super().__init__()
        self.W = nn.Parameter(0.01 * torch.randn(latent_dim, latent_dim))

    def forward(self, z_S: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Evaluate the quadratic form for every row of ``z_S``.

        Args:
            z_S: Batch of latents, shape ``[B, latent_dim]``.

        Returns:
            ``(value, V)``: the values with shape ``[B, 1]`` and the
            symmetrised matrix that produced them.
        """
        symmetric = (self.W + self.W.T) * 0.5
        quadratic = torch.einsum('bi,ij,bj->b', z_S, symmetric, z_S)
        return quadratic.unsqueeze(-1), symmetric

# ==========================================
# 2) Agent mit Policy-Mix & optionalem Concat-Head
# ==========================================
class STCAlbertaAgent(nn.Module):
    """Agent that encodes observations, projects them and acts on the result.

    ``forward`` runs: observation -> ``encoder`` -> latent ``z`` ->
    ``projector`` -> ``z_S``; the value is the quadratic form of
    ``value_op`` on ``z_S`` and the policy logits come from one of two
    heads, selected by ``policy_mode``.  A latent transition model and a
    reward head (``imagine``) operate on ``z_S`` concatenated with a one-hot
    action.  ``value_target`` is an exponential-moving-average copy of
    ``value_op`` used for TD targets.
    """

    def __init__(self, obs_dim: int, action_dim: int, latent_dim: int = 64, intent_rank: int = 16):
        """Build all sub-networks.

        Args:
            obs_dim: Size of one observation vector.
            action_dim: Number of discrete actions.
            latent_dim: Size of the latent vectors ``z`` and ``z_S``.
            intent_rank: Rank of the projector's subspace.
        """
        super().__init__()
        self.obs_dim     = obs_dim
        self.action_dim  = action_dim
        self.latent_dim  = latent_dim
        self.intent_rank = intent_rank

        self.encoder = nn.Sequential(
            nn.Linear(obs_dim, 128), nn.LayerNorm(128), nn.ReLU(),
            nn.Linear(128, latent_dim)
        )
        self.projector   = SpectralProjector(latent_dim, intent_rank)
        self.value_op    = ValueOperator(latent_dim)

        # EMA target copy of the value operator, for TD stability.
        self.value_target = ValueOperator(latent_dim)
        self.value_target.load_state_dict(self.value_op.state_dict())
        # The target must only move through update_value_target(); keeping its
        # parameters out of autograd guarantees no optimiser step can touch it.
        for target_param in self.value_target.parameters():
            target_param.requires_grad_(False)
        self.value_tau = 0.005

        # "mix" policy head: input has the same size as z / z_S.
        self.policy = nn.Sequential(
            nn.Linear(latent_dim, 128), nn.ReLU(), nn.Linear(128, action_dim)
        )
        # "concat" policy head: optional head on [z, z_S] (2 * latent_dim).
        self.policy_concat = nn.Sequential(
            nn.Linear(2*latent_dim, 128), nn.ReLU(), nn.Linear(128, action_dim)
        )

        # Dynamics and reward models in z_S space.
        self.transition = nn.Sequential(
            nn.Linear(latent_dim + action_dim, 128), nn.ReLU(), nn.Linear(128, latent_dim)
        )
        self.reward_head = nn.Sequential(
            nn.Linear(latent_dim + action_dim, 64), nn.ReLU(), nn.Linear(64, 1)
        )

    @torch.no_grad()
    def update_value_target(self):
        """Polyak-average ``value_op`` into ``value_target`` in place.

        ``target <- (1 - value_tau) * target + value_tau * online``.
        """
        tau = self.value_tau
        for online, target in zip(self.value_op.parameters(), self.value_target.parameters()):
            # Already under no_grad, so in-place ops on the parameters are
            # safe without going through the legacy ``.data`` attribute.
            target.mul_(1 - tau).add_(tau * online)

    def forward(
        self,
        obs: torch.Tensor,
        mix_alpha: float = 1.0,          # 0 -> pure z, 1 -> pure z_S
        policy_mode: str = "mix"         # "mix", "concat" or "zs" (z_S only)
    ) -> Dict[str, torch.Tensor]:
        """Encode ``obs`` and compute value, coherence and policy logits.

        Args:
            obs: Observation batch, shape ``[B, obs_dim]``.
            mix_alpha: Interpolation weight between ``z`` (0) and ``z_S`` (1);
                only used when ``policy_mode == "mix"``.
            policy_mode: ``"mix"`` feeds the interpolated latent to
                ``policy``; ``"concat"`` feeds ``[z, z_S]`` to
                ``policy_concat``; ``"zs"`` feeds ``z_S`` to ``policy``.
                Any other string behaves like ``"zs"``.

        Returns:
            Dict with keys ``latent``, ``latent_S``, ``projector``,
            ``value_matrix``, ``kappa``, ``value`` and ``logits``.
        """
        z      = self.encoder(obs)       # [B,D]
        z_S, P = self.projector(z)       # [B,D], [D,D]
        kappa  = coherence(z, z_S)       # [B,1]
        value, V = self.value_op(z_S)    # [B,1], [D,D]

        if policy_mode == "mix":
            feat   = (1.0 - mix_alpha) * z + mix_alpha * z_S
            logits = self.policy(feat)
        elif policy_mode == "concat":
            feat   = torch.cat([z, z_S], dim=-1)
            logits = self.policy_concat(feat)
        else:
            # "zs" and, as the historical fallback, any unrecognised mode.
            logits = self.policy(z_S)

        return {
            'latent': z, 'latent_S': z_S, 'projector': P,
            'value_matrix': V, 'kappa': kappa, 'value': value, 'logits': logits
        }

    def imagine(self, z_S: torch.Tensor, actions: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Predict the next projected latent and the reward for ``(z_S, a)``.

        Args:
            z_S: Projected latents, shape ``[B, latent_dim]``.
            actions: Either ``[B]`` action indices or ``[B, action_dim]``
                one-hot (or otherwise weighted) action vectors.

        Returns:
            ``(z_S_next, reward)`` with shapes ``[B, latent_dim]`` and ``[B, 1]``.

        Raises:
            ValueError: If ``actions`` has neither of the accepted shapes.
        """
        if actions.dim() == 1:
            a_oh = F.one_hot(actions.long(), num_classes=self.action_dim).float()
        elif actions.dim() == 2 and actions.size(-1) == self.action_dim:
            a_oh = actions.float()
        else:
            raise ValueError("actions must be [B] (indices) or [B,A] (one-hot)")
        za = torch.cat([z_S, a_oh], dim=-1)
        z_S_next = self.transition(za)
        reward   = self.reward_head(za)
        return z_S_next, reward

    def select_action(self, obs: torch.Tensor, epsilon: float = 0.0, mix_alpha: float = 1.0, policy_mode: str = "mix") -> int:
        """Pick an action ε-greedily for a single, unbatched observation.

        Args:
            obs: One observation, shape ``[obs_dim]``.
            epsilon: Probability of returning a uniformly random action.
            mix_alpha: Forwarded to ``forward``.
            policy_mode: Forwarded to ``forward``.

        Returns:
            The chosen action index as a Python ``int``.
        """
        # Decide on exploration first: a random action does not need the
        # network, so the forward pass is skipped entirely in that case.
        if random.random() < epsilon:
            return random.randint(0, self.action_dim - 1)
        with torch.no_grad():
            out = self.forward(obs.unsqueeze(0), mix_alpha=mix_alpha, policy_mode=policy_mode)
        return out['logits'].argmax(dim=-1).item()

# ==========================================
# 3) Verlustfunktion (inkl. Advantage-Normierung & Policy-Mix)
# ==========================================
def stc_loss(
    agent: "STCAlbertaAgent",
    obs: torch.Tensor, actions: torch.Tensor, rewards: torch.Tensor,
    next_obs: torch.Tensor, dones: torch.Tensor,
    gamma: float = 0.99,
    lambda_kappa: float = 0.1,
    lambda_comm: float = 0.01,
    lambda_cons: float = 1.0,
    entropy_coef: float = 5e-4,
    # forwarded to the policy:
    mix_alpha: float = 1.0,
    policy_mode: str = "mix",
) -> Dict[str, torch.Tensor]:
    """Compute the combined training loss for one batch of transitions.

    The total is ``L_RL + lambda_kappa * L_rel + lambda_comm * L_comm +
    lambda_cons * L_cons`` where

    * ``L_RL``   = policy-gradient loss (normalised TD error as advantage)
      + value MSE against the EMA-target TD target - entropy bonus,
    * ``L_rel``  = negative mean coherence κ (relevance maximisation),
    * ``L_comm`` = mean squared entry of the commutator ``[V, P]``,
    * ``L_cons`` = MSE between the model-based Q estimate from
      ``agent.imagine`` and the TD target.

    Args:
        agent: Agent providing ``forward``, ``imagine`` and ``value_target``.
        obs: Observations, shape ``[B, obs_dim]``.
        actions: Integer action indices, shape ``[B]``.
        rewards: Rewards, shape ``[B]``.
        next_obs: Next observations, shape ``[B, obs_dim]``.
        dones: Episode-end flags as floats (1.0 = done), shape ``[B]``.
        gamma: Discount factor.
        lambda_kappa: Weight of ``L_rel``.
        lambda_comm: Weight of ``L_comm``.
        lambda_cons: Weight of ``L_cons``.
        entropy_coef: Weight of the entropy bonus inside ``L_RL``.
        mix_alpha: Passed through to ``agent.forward``.
        policy_mode: Passed through to ``agent.forward``.

    Returns:
        Dict of scalar tensors: ``total_loss``, ``L_RL``, ``L_rel``,
        ``L_comm``, ``L_cons``, ``kappa``, ``td_error``, ``policy_loss``,
        ``value_loss`` and ``entropy``.
    """
    out      = agent(obs,      mix_alpha=mix_alpha, policy_mode=policy_mode)
    out_next = agent(next_obs, mix_alpha=mix_alpha, policy_mode=policy_mode)

    z_S, P, V     = out['latent_S'], out['projector'], out['value_matrix']
    kappa         = out['kappa']
    value, logits = out['value'], out['logits']

    dist  = torch.distributions.Categorical(logits=logits)
    logp  = dist.log_prob(actions)

    not_done = 1 - dones.unsqueeze(-1)

    # TD target from the EMA target operator.  V_target is created under
    # no_grad and is therefore a constant for everything that follows.
    with torch.no_grad():
        v_next_target, V_target = agent.value_target(out_next['latent_S'])
        td_target = rewards.unsqueeze(-1) + gamma * v_next_target * not_done

    td_err = td_target - value

    # Advantage normalisation.  The standard deviation of a single sample is
    # NaN, so a batch of one keeps its raw TD error instead.
    adv = td_err.squeeze(-1).detach()
    if adv.numel() > 1:
        adv = (adv - adv.mean()) / (adv.std() + 1e-8)

    policy_loss = -(logp * adv).mean()
    value_loss  = F.mse_loss(value, td_target)
    entropy     = dist.entropy().mean()
    L_RL        = policy_loss + value_loss - entropy_coef * entropy

    # Relevance maximisation.
    L_rel  = -kappa.mean()

    # Commutator penalty: squared Frobenius norm divided by the number of
    # entries, written as a mean of squares so no square root is involved.
    comm   = V @ P - P @ V
    L_comm = comm.pow(2).mean()

    # Consistency: model-based Q vs. target Q.
    # The predicted next latent is scored with the *constant* target matrix
    # outside of no_grad.  Gradients therefore reach the transition model
    # (which would otherwise never be trained) while the target operator's
    # parameters stay untouched.
    z_S_next_pred, reward_pred = agent.imagine(z_S, actions)
    v_next_pred_target = torch.einsum(
        'bi,ij,bj->b', z_S_next_pred, V_target, z_S_next_pred
    ).unsqueeze(-1)
    q_pred = reward_pred + gamma * v_next_pred_target * not_done
    L_cons = F.mse_loss(q_pred, td_target)   # td_target is exactly the target Q

    total = L_RL + lambda_kappa * L_rel + lambda_comm * L_comm + lambda_cons * L_cons
    return {
        'total_loss': total, 'L_RL': L_RL, 'L_rel': L_rel, 'L_comm': L_comm, 'L_cons': L_cons,
        'kappa': kappa.mean(), 'td_error': td_err.abs().mean(),
        'policy_loss': policy_loss, 'value_loss': value_loss, 'entropy': entropy
    }

# ==========================================
# 4) Einfache 2D-Nav-Umwelt (Reward-Shaping optional)
# ==========================================
class SimpleControlEnv:
    """Minimal 2D point-mass navigation task.

    Observation: ``[x, y, goal_x, goal_y, vx, vy]`` (6 values).
    Actions:     0 = up, 1 = down, 2 = left, 3 = right.
    Reward:      ``-distance + progress_bonus * (prev_dist - dist)``.
    Done:        ``dist < 0.1`` or ``steps >= max_steps``.
    """

    def __init__(self, max_steps: int = 200, progress_bonus: float = 0.0):
        """Store the settings and start the first episode.

        Args:
            max_steps: Step count at which an episode is reported as done.
            progress_bonus: Weight of the per-step distance-reduction term.
        """
        self.state_dim, self.action_dim = 6, 4
        self.max_steps = max_steps
        self.progress_bonus = progress_bonus
        self.reset()

    def reset(self) -> np.ndarray:
        """Draw a new start position and goal uniformly from [-1, 1]² and return the observation."""
        # Position is drawn before the goal; velocity starts at zero.
        self.pos = np.random.uniform(-1, 1, 2)
        self.vel = np.zeros(2)
        self.goal = np.random.uniform(-1, 1, 2)
        self.steps = 0
        return self._get_obs()

    def _get_obs(self) -> np.ndarray:
        """Return the current observation as one flat array."""
        parts = (self.pos, self.goal, self.vel)
        return np.concatenate(parts)

    def step(self, action: int):
        """Apply one action and advance the simulation by one step.

        Args:
            action: Action index in ``{0, 1, 2, 3}``; any other value raises
                ``KeyError``.

        Returns:
            ``(obs, reward, done, info)`` where ``info`` is
            ``{'distance': dist}`` with the new distance to the goal.
        """
        dist_before = float(np.linalg.norm(self.pos - self.goal))

        # Acceleration contributed by each discrete action.
        thrust = {
            0: np.array([0, 0.1]),
            1: np.array([0, -0.1]),
            2: np.array([-0.1, 0]),
            3: np.array([0.1, 0]),
        }[action]

        # Damped velocity update, then position update, both clipped.
        self.vel = np.clip(0.9 * self.vel + thrust, -0.5, 0.5)
        self.pos = np.clip(self.pos + self.vel, -1, 1)

        dist_after = float(np.linalg.norm(self.pos - self.goal))
        progress = dist_before - dist_after
        reward = -dist_after + self.progress_bonus * progress

        self.steps += 1
        reached_goal = dist_after < 0.1
        out_of_time = self.steps >= self.max_steps
        done = reached_goal or out_of_time
        return self._get_obs(), reward, done, {'distance': dist_after}

# ==========================================
# 5) Replay Buffer
# ==========================================
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity: int = 10000):
        """Create an empty buffer; once ``capacity`` is reached the oldest entries are dropped."""
        self.buffer = deque(maxlen=capacity)

    def push(self, obs, action, reward, next_obs, done):
        """Append one ``(obs, action, reward, next_obs, done)`` transition."""
        transition = (obs, action, reward, next_obs, done)
        self.buffer.append(transition)

    def sample(self, batch_size: int) -> Dict[str, torch.Tensor]:
        """Draw ``batch_size`` distinct transitions and stack them into tensors.

        Returns:
            Dict with float tensors ``obs``, ``rewards``, ``next_obs``,
            ``dones`` and a long tensor ``actions``.  The key names match the
            keyword parameters of ``stc_loss``.
        """
        drawn = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one tuple per field.
        obs_col, action_col, reward_col, next_obs_col, done_col = zip(*drawn)
        batch: Dict[str, torch.Tensor] = {}
        batch['obs'] = torch.FloatTensor(np.array(obs_col))
        batch['actions'] = torch.LongTensor(action_col)
        batch['rewards'] = torch.FloatTensor(reward_col)
        batch['next_obs'] = torch.FloatTensor(np.array(next_obs_col))
        batch['dones'] = torch.FloatTensor(done_col)
        return batch

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

# ==========================================
# 6) Training (Warm-up & Ramp & Policy-Mix + (b) Projector Freeze)
# ==========================================
def train_stc_alberta(
    num_episodes: int = 150,
    batch_size: int = 64,
    buffer_size: int = 10000,
    learning_rate: float = 3e-4,
    epsilon_start: float = 0.5,
    epsilon_end: float = 0.05,
    epsilon_decay: float = 0.992,
    print_every: int = 10,
    # Target loss weights (reached at the end of the ramp)
    lambda_kappa: float = 0.02,
    lambda_comm: float = 0.01,
    lambda_cons: float = 1.3,
    gamma: float = 0.99,
    entropy_coef: float = 2e-4,
    progress_bonus: float = 0.0,
    # Scheduling & policy mix
    warmup_episodes: int = 25,
    ramp_episodes: int = 80,
    policy_mode: str = "concat",    # "mix", "concat" or "zs"
    alpha_max: float = 0.7,         # maximum share of z_S in the policy input
    # Rank of Π_S
    intent_rank: int = 8,
):
    """Train an ``STCAlbertaAgent`` on ``SimpleControlEnv`` and print progress.

    Each episode is rolled out ε-greedily and stored in a replay buffer.
    After the episode, if the buffer holds at least ``batch_size``
    transitions, exactly one gradient step is taken on one sampled batch
    (gradient norm clipped to 1.0), followed by one EMA update of the value
    target.

    Schedule: ``ramp`` is 0 for the first ``warmup_episodes`` episodes and
    then rises linearly to 1 over ``ramp_episodes`` episodes.
    ``lambda_kappa``, ``lambda_comm`` and the policy mix weight
    (``alpha_max``) are multiplied by ``ramp``; ``lambda_cons`` and
    ``entropy_coef`` are used at full strength from the start.  The first
    time ``ramp`` reaches 1 the projector's parameters are frozen.

    Args:
        num_episodes: Number of training episodes.
        batch_size: Batch size of the per-episode update and minimum buffer
            fill before updates start.
        buffer_size: Replay buffer capacity.
        learning_rate: Adam learning rate.
        epsilon_start: Initial exploration probability.
        epsilon_end: Lower bound of the exploration probability.
        epsilon_decay: Factor applied to epsilon after every episode.
        print_every: Print a progress line every this many episodes.
        lambda_kappa: Final weight of the relevance loss.
        lambda_comm: Final weight of the commutator loss.
        lambda_cons: Weight of the consistency loss.
        gamma: Discount factor.
        entropy_coef: Weight of the entropy bonus.
        progress_bonus: Reward-shaping weight passed to the environment.
        warmup_episodes: Episodes during which ``ramp`` stays at 0.
        ramp_episodes: Episodes over which ``ramp`` rises from 0 to 1.
        policy_mode: Policy head selection passed to the agent.
        alpha_max: Value of ``mix_alpha`` once ``ramp`` is 1.
        intent_rank: Rank of the agent's projector.

    Returns:
        ``(agent, ep_rewards, ep_kappas, ep_dists)``: the trained agent and,
        per episode, the return, the mean κ over its steps and the distance
        to the goal reported by the final step.
    """
    env      = SimpleControlEnv(progress_bonus=progress_bonus)
    agent    = STCAlbertaAgent(obs_dim=env.state_dim, action_dim=env.action_dim,
                               latent_dim=64, intent_rank=intent_rank)
    optimizer= torch.optim.Adam(agent.parameters(), lr=learning_rate)
    buffer   = ReplayBuffer(capacity=buffer_size)

    epsilon  = epsilon_start
    ep_rewards, ep_kappas, ep_dists = [], [], []

    print("="*70)
    print("STC–Alberta Agent Training")
    print("="*70)
    print("Environment: SimpleControlEnv (2D)")
    print(f"State dim: {env.state_dim}, Action dim: {env.action_dim}")
    print(f"Agent latent dim: 64, Intent rank: {intent_rank}")
    print("="*70); print()

    projector_frozen = False

    for episode in range(num_episodes):
        # --- Schedule: 0 during warm-up, then linear ramp to 1 ---
        if episode < warmup_episodes:
            ramp = 0.0
        else:
            # max(1, ...) guards against division by zero for ramp_episodes == 0.
            ramp = min(1.0, (episode - warmup_episodes) / max(1, ramp_episodes))

        lambda_kappa_eff = lambda_kappa * ramp
        lambda_comm_eff  = lambda_comm  * ramp
        mix_alpha        = alpha_max * ramp  # cap α at alpha_max

        # (b) Freeze the projector once the ramp has finished (one time only).
        if (ramp >= 1.0) and (not projector_frozen):
            for p in agent.projector.parameters():
                p.requires_grad = False
            projector_frozen = True
            print("[info] Projector frozen for late-stage polishing.")

        obs = env.reset()
        ep_reward = 0.0
        ep_k_list: List[float] = []
        done = False

        # Roll out one episode; no learning happens inside this loop.
        while not done:
            action = agent.select_action(torch.FloatTensor(obs), epsilon,
                                         mix_alpha=mix_alpha, policy_mode=policy_mode)
            next_obs, reward, done, info = env.step(action)
            buffer.push(obs, action, reward, next_obs, float(done))

            # Extra forward pass on the pre-step observation, only to log κ.
            with torch.no_grad():
                out = agent(torch.FloatTensor(obs).unsqueeze(0),
                            mix_alpha=mix_alpha, policy_mode=policy_mode)
                ep_k_list.append(out['kappa'].item())

            obs = next_obs
            ep_reward += reward

        ep_rewards.append(ep_reward)
        ep_kappas.append(float(np.mean(ep_k_list)))
        # `info` is the dict returned by the episode's last step.
        ep_dists.append(info['distance'])

        # One gradient step per episode, as soon as a full batch is available.
        if len(buffer) >= batch_size:
            batch = buffer.sample(batch_size)
            optimizer.zero_grad()
            losses = stc_loss(
                agent, **batch,
                gamma=gamma,
                lambda_kappa=lambda_kappa_eff,
                lambda_comm=lambda_comm_eff,
                lambda_cons=lambda_cons,
                entropy_coef=entropy_coef,
                mix_alpha=mix_alpha,
                policy_mode=policy_mode,
            )
            losses['total_loss'].backward()
            torch.nn.utils.clip_grad_norm_(agent.parameters(), 1.0)
            optimizer.step()
            agent.update_value_target()

        # Multiplicative epsilon decay, floored at epsilon_end.
        epsilon = max(epsilon_end, epsilon * epsilon_decay)

        if (episode + 1) % print_every == 0:
            # Averages over the last `print_every` episodes.
            avgR = float(np.mean(ep_rewards[-print_every:]))
            avgK = float(np.mean(ep_kappas[-print_every:]))
            avgD = float(np.mean(ep_dists[-print_every:]))
            msg  = (f"Episode {episode+1:4d} | Reward: {avgR:7.2f} | κ: {avgK:.3f} "
                    f"| Dist: {avgD:.3f} | ε: {epsilon:.3f} | ramp: {ramp:.2f} | α: {mix_alpha:.2f}")
            # Same condition as the update above, so `losses` is defined here;
            # the values shown are those of the most recent update.
            if len(buffer) >= batch_size:
                msg += (f"\n             | L_RL: {losses['L_RL'].item():.4f} "
                        f"| L_rel: {losses['L_rel'].item():.4f} "
                        f"| L_comm: {losses['L_comm'].item():.6f} "
                        f"| L_cons: {losses['L_cons'].item():.4f} "
                        f"| H: {losses['entropy'].item():.3f}")
            print(msg)

    # Final report over the last 30 episodes.
    k_tail = np.array(ep_kappas[-30:], dtype=np.float32)
    r_tail = np.array(ep_rewards[-30:], dtype=np.float32)
    # Pearson correlation between per-episode κ and return; NaN if fewer than 2 episodes.
    corr = float(np.corrcoef(k_tail, r_tail)[0, 1]) if len(k_tail) > 1 else float("nan")

    print()
    print("="*70)
    print("Training Complete")
    print("="*70)
    print(f"Final κ (last 30): {np.mean(k_tail):.3f}")
    print(f"Final reward (last 30): {np.mean(r_tail):.2f}")
    print(f"Final distance (last 30): {np.mean(ep_dists[-30:]):.3f}")
    print(f"Corr(Return, κ) (last 30): {corr:.3f}")

    return agent, ep_rewards, ep_kappas, ep_dists

# ==========================================
# 7) Ablationen
# ==========================================
def run_ablation_no_projector(num_episodes: int = 80) -> float:
    """Ablation A1: train without the projector Π_S (full latent space).

    The agent's ``forward`` is replaced by one that uses ``z`` directly as
    ``z_S``, reports the identity as projector and a constant κ of 1.
    Training uses a fixed ε of 0.1, one update of 64 samples per episode and
    ``lambda_kappa=0`` (κ is constant here).

    Args:
        num_episodes: Number of training episodes.

    Returns:
        Mean episode return over the last 30 episodes.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("ABLATION A1: No Projector (Full Latent Space)")
    print(rule)

    class NoProjAgent(STCAlbertaAgent):
        def forward(self, obs, mix_alpha: float = 0.0, policy_mode: str = "mix"):
            latent = self.encoder(obs)
            identity = torch.eye(self.latent_dim, device=latent.device)
            unit_kappa = torch.ones(latent.shape[0], 1, device=latent.device)
            value, value_matrix = self.value_op(latent)
            if policy_mode == "concat":
                logits = self.policy_concat(torch.cat([latent, latent], dim=-1))
            else:
                # "mix" and every other mode: plain policy head, no bottleneck.
                logits = self.policy(latent)
            return {'latent': latent, 'latent_S': latent, 'projector': identity,
                    'value_matrix': value_matrix, 'kappa': unit_kappa,
                    'value': value, 'logits': logits}

    env = SimpleControlEnv()
    agent = NoProjAgent(obs_dim=6, action_dim=4, latent_dim=64, intent_rank=16)
    optimizer = torch.optim.Adam(agent.parameters(), lr=3e-4)
    replay = ReplayBuffer()
    returns: List[float] = []

    for _ in range(num_episodes):
        state = env.reset()
        episode_return = 0.0
        finished = False
        while not finished:
            chosen = agent.select_action(torch.FloatTensor(state), epsilon=0.1, mix_alpha=0.0, policy_mode="mix")
            next_state, step_reward, finished, _ = env.step(chosen)
            replay.push(state, chosen, step_reward, next_state, float(finished))
            state = next_state
            episode_return += step_reward

        returns.append(episode_return)

        # Wait until one full batch of 64 transitions has been collected.
        if len(replay) < 64:
            continue
        minibatch = replay.sample(64)
        optimizer.zero_grad()
        loss_terms = stc_loss(agent, **minibatch, lambda_kappa=0.0, mix_alpha=0.0, policy_mode="mix")
        loss_terms['total_loss'].backward()
        optimizer.step()
        agent.update_value_target()

    tail_mean = float(np.mean(returns[-30:]))
    print(f"Final reward (A1, last 30): {tail_mean:.2f}")
    return tail_mean

def run_ablation_random_projector(num_episodes: int = 80) -> float:
    """Ablation A2: random, fixed Π_S (never learned).

    The projector is built once from the Q factor of a random matrix and
    stored as a buffer.  The author expects this to perform worse than a
    learned Π_S.  Training uses a fixed ε of 0.1, ``mix_alpha=1.0`` and one
    update of 64 samples per episode with ``stc_loss``'s default weights.

    Args:
        num_episodes: Number of training episodes.

    Returns:
        Mean episode return over the last 30 episodes.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("ABLATION A2: Random Fixed Projector")
    print(rule)

    class RandProjAgent(STCAlbertaAgent):
        def __init__(self, obs_dim, action_dim, latent_dim=64, intent_rank=16):
            super().__init__(obs_dim, action_dim, latent_dim, intent_rank)
            with torch.no_grad():
                random_basis = torch.randn(latent_dim, intent_rank)
                orthonormal = torch.linalg.qr(random_basis)[0]
                # A buffer follows the module across devices but is not trained.
                self.register_buffer('Q_fixed', orthonormal)

        def forward(self, obs, mix_alpha: float = 1.0, policy_mode: str = "mix"):
            latent = self.encoder(obs)
            projection = self.Q_fixed @ self.Q_fixed.T
            projected = latent @ projection
            kappa = coherence(latent, projected)
            value, value_matrix = self.value_op(projected)
            if policy_mode == "concat":
                logits = self.policy_concat(torch.cat([latent, projected], dim=-1))
            elif policy_mode == "mix":
                blended = (1.0 - mix_alpha) * latent + mix_alpha * projected
                logits = self.policy(blended)
            else:
                logits = self.policy(projected)
            return {'latent': latent, 'latent_S': projected, 'projector': projection,
                    'value_matrix': value_matrix, 'kappa': kappa,
                    'value': value, 'logits': logits}

    env = SimpleControlEnv()
    agent = RandProjAgent(obs_dim=6, action_dim=4, latent_dim=64, intent_rank=16)
    optimizer = torch.optim.Adam(agent.parameters(), lr=3e-4)
    replay = ReplayBuffer()
    returns: List[float] = []

    for _ in range(num_episodes):
        state = env.reset()
        episode_return = 0.0
        finished = False
        while not finished:
            chosen = agent.select_action(torch.FloatTensor(state), epsilon=0.1, mix_alpha=1.0, policy_mode="mix")
            next_state, step_reward, finished, _ = env.step(chosen)
            replay.push(state, chosen, step_reward, next_state, float(finished))
            state = next_state
            episode_return += step_reward

        returns.append(episode_return)

        # Wait until one full batch of 64 transitions has been collected.
        if len(replay) < 64:
            continue
        minibatch = replay.sample(64)
        optimizer.zero_grad()
        loss_terms = stc_loss(agent, **minibatch, mix_alpha=1.0, policy_mode="mix")
        loss_terms['total_loss'].backward()
        optimizer.step()
        agent.update_value_target()

    tail_mean = float(np.mean(returns[-30:]))
    print(f"Final reward (A2, last 30): {tail_mean:.2f}")
    return tail_mean


In [None]:
# Main run: settings collected in one place, then passed on unchanged.
train_settings = dict(
    num_episodes=150,
    print_every=10,
    # Target loss weights (reached at the end of the ramp):
    lambda_kappa=0.02,
    lambda_comm=0.01,
    lambda_cons=1.3,
    entropy_coef=2e-4,
    progress_bonus=0.0,     # measure without reward shaping first
    # Scheduling & policy mix:
    warmup_episodes=25,
    ramp_episodes=80,
    policy_mode="concat",
    alpha_max=0.7,
    # Rank of Π_S:
    intent_rank=8,
)
agent, rewards, kappas, dists = train_stc_alberta(**train_settings)

# Ablation baselines, each trained for 80 episodes.
a1 = run_ablation_no_projector(num_episodes=80)
a2 = run_ablation_random_projector(num_episodes=80)

# Mean return of the main run over its last 30 episodes, used in every line below.
stc_tail_mean = np.mean(rewards[-30:])
summary_rule = "=" * 70

print("\n" + summary_rule)
print("RESULTS SUMMARY")
print(summary_rule)
print(f"STC–Alberta (with Π_S): {stc_tail_mean:.2f} (last 30 eps)")
print(f"Ablation A1 (no Π_S):   {a1:.2f} (last 30 eps)")
print(f"Ablation A2 (rand Π_S): {a2:.2f} (last 30 eps)")
print(f"Δ vs A1:                {stc_tail_mean - a1:.2f}")
print(f"Δ vs A2:                {stc_tail_mean - a2:.2f}")
print(summary_rule)


STC–Alberta Agent Training
Environment: SimpleControlEnv (2D)
State dim: 6, Action dim: 4
Agent latent dim: 64, Intent rank: 8

Episode   10 | Reward: -258.67 | κ: 0.130 | Dist: 1.215 | ε: 0.461 | ramp: 0.00 | α: 0.00
             | L_RL: 1.8712 | L_rel: -0.2220 | L_comm: 0.000011 | L_cons: 2.2977 | H: 1.381
Episode   20 | Reward: -280.11 | κ: 0.356 | Dist: 1.572 | ε: 0.426 | ramp: 0.00 | α: 0.00
             | L_RL: 0.5021 | L_rel: -0.4696 | L_comm: 0.000016 | L_cons: 2.0876 | H: 1.371
Episode   30 | Reward: -246.45 | κ: 0.518 | Dist: 1.247 | ε: 0.393 | ramp: 0.05 | α: 0.03
             | L_RL: 0.3288 | L_rel: -0.5086 | L_comm: 0.000017 | L_cons: 1.7187 | H: 1.363
Episode   40 | Reward: -269.08 | κ: 0.508 | Dist: 1.375 | ε: 0.363 | ramp: 0.17 | α: 0.12
             | L_RL: 0.2825 | L_rel: -0.5323 | L_comm: 0.000016 | L_cons: 1.1106 | H: 1.351
Episode   50 | Reward: -285.64 | κ: 0.572 | Dist: 1.428 | ε: 0.335 | ramp: 0.30 | α: 0.21
             | L_RL: 0.2367 | L_rel: -0.6161 | L_comm: