# Anti-CHARM v2 — Contextual Risk Demo

This notebook demonstrates minimal usage of Anti-CHARM as a contextual
regularizer on top of a simple tabular actor. The full environment
is not modeled here; the only goal is to visualize how the
penalty and λ_t evolve over the simulated steps.


In [None]:
import os, sys, math
import numpy as np
import matplotlib.pyplot as plt

# Project root: the grandparent of the current working directory.
# os.path.split(...)[0] is the directory part of a path, applied twice.
ROOT = os.path.split(os.path.split(os.getcwd())[0])[0]

# Directory that is expected to hold the Anti-CHARM modules.
LEVO_PATH = os.path.join(ROOT, "levo")

# Register it on the import path once, so re-running the cell does not
# keep appending duplicates.
if LEVO_PATH not in sys.path:
    sys.path.append(LEVO_PATH)

from anti_charm_agent import AntiCharmAgent, AntiCharmConfig

# Dimensions of the small synthetic tabular world.
num_states, num_actions = 16, 4

# Build the Anti-CHARM agent from a config sized to that world.
cfg = AntiCharmConfig(num_actions=num_actions, num_states=num_states)
anti = AntiCharmAgent(cfg)

def softmax(x, tau=1.0):
    """Return the temperature-scaled softmax of ``x``.

    The normalisation runs over every element of ``x`` (the maximum and
    the sum are taken over the whole array, not along an axis).

    Args:
        x: Array-like of scores; converted to a float NumPy array.
        tau: Temperature. Scores are divided by ``float(tau)`` before
            exponentiation.

    Returns:
        A float NumPy array with the same shape as ``x`` holding
        ``exp(x / tau)`` divided by its sum.
    """
    x = np.asarray(x, dtype=float)
    x = x / float(tau)
    # Shift so the largest scaled score is 0; the largest exponential is
    # then exp(0) = 1, which keeps np.exp from overflowing.
    x = x - x.max()
    ex = np.exp(x)
    s = ex.sum()
    # Floor the normaliser so the division never sees a vanishing sum.
    return ex / max(s, 1e-8)

# Number of simulated steps and a seeded generator so runs are repeatable.
T = 200
rng = np.random.default_rng(0)

# Per-step traces, filled in the loop and plotted afterwards.
lambda_hist = []
penalty_hist = []

# Synthetic Q_charm table (e.g., coming from some base actor):
# one row per state, one column per action.
Q = rng.normal(loc=0.0, scale=1.0, size=(num_states, num_actions))

# Running global statistics handed to the agent at every step.
diversity = 0.0
visited_states = set()
cum_reward = 0.0

for t in range(T):
    # Draw a random state index and record that it has been visited.
    s = int(rng.integers(0, num_states))
    visited_states.add(s)

    # Base policy from the main actor and its entropy (natural log).
    # Probabilities are clipped before the log so log(0) never occurs.
    p_base = softmax(Q[s], tau=1.0)
    log_p = np.log(np.clip(p_base, 1e-8, 1.0))
    H_policy = -np.sum(p_base * log_p)

    # Fraction of the state space seen so far.
    diversity = len(visited_states) / float(num_states)

    # Average reward accumulated before this step.
    # NOTE(review): cum_reward holds t rewards at this point but is divided
    # by t + 1 — confirm the intended denominator.
    if t > 0:
        reward_density = cum_reward / float(t + 1)
    else:
        reward_density = 0.0

    stats = {
        "step_norm": t / float(T),
        "H_policy": H_policy,
        "temp": 1.0,
        "diversity": diversity,
        "reward_density": reward_density,
    }

    p_vec, lambda_t = anti.penalty_vector(s, stats)

    # Penalised action values, greedy action, and a synthetic reward
    # (illustrative only: the reward is just the chosen penalised value).
    Q_eff = Q[s] - lambda_t * p_vec
    a = int(np.argmax(Q_eff))
    r = float(Q_eff[a])
    cum_reward += r

    lambda_hist.append(lambda_t)
    penalty_hist.append(p_vec.mean())

# Freeze the traces into arrays for the summary and the plots.
lambda_hist = np.array(lambda_hist)
penalty_hist = np.array(penalty_hist)

print(f"Simulated steps: {T}")
print(f"Mean lambda_t: {lambda_hist.mean():.3f}")
print(f"Mean penalty: {penalty_hist.mean():.3f}")


In [None]:
# λ_t evolution: one point per simulated step.
fig, ax = plt.subplots()
ax.plot(lambda_hist)
ax.set_xlabel("Simulation step")
ax.set_ylabel("lambda_t")
ax.set_title("Anti-CHARM — λ_t evolution")
plt.show()


In [None]:
# Mean P_anti penalty: per-step average of the penalty vector.
fig, ax = plt.subplots()
ax.plot(penalty_hist)
ax.set_xlabel("Simulation step")
ax.set_ylabel("Mean P_anti")
ax.set_title("Anti-CHARM — average penalty")
plt.show()
