# $\gamma = 1/2$ 

In [3]:
import sys
from pathlib import Path
import torch

ROOT = Path.cwd().parent  # notebook/ 的 parent
sys.path.insert(0, str(ROOT))

import dlphys.models  # trigger registration
from dlphys.config.base import ExperimentConfig
from dlphys.config.registry import build_model
from dlphys.analysis.jvp import jvp_F
from dlphys.utils.seed import set_seed

set_seed(0, deterministic=True)

L = 100
d_model = 16
d_k = 32
gamma = 0.5

cfg = ExperimentConfig(
    project_name="toy",
    device="cpu",
    seed=0,
    deterministic=True,
    extra={
        "model_name": "toy_attention",
        "model_kwargs": dict(d_model=d_model, d_k=d_k, L=L, num_heads=1, gamma=gamma, phi="identity"),
    }
)
m = build_model(cfg).eval()

B = 1
s = torch.randn(B, L+1, d_model)

NameError: name 'ToyAttentionConfig' is not defined

In [None]:
w = m.Wq[0].weight
print("Wq mean/std:", w.mean().item(), w.std(unbiased=False).item())
print("target std:", 1.0 / (m.cfg.d_model ** 0.5))

In [None]:
from dlphys.analysis.lyapunov import lyapunov_max_benettin

# 用同一个初态（或拷贝一份）
s0 = s.clone()

F = lambda _s: m(_s)

out = lyapunov_max_benettin(
    F, s0,
    burn_in=int(1e4 * 0.9),
    T=1e4,
    return_traj=True
)

print("lambda_hat (per batch):", out["lambda_hat"])
print("lambda_mean:", out["lambda_mean"])

In [1]:
@torch.no_grad()
def sensitivity_profile(m, s, n_trials=5):
    """
    Returns: sens[ L+1 ] where sens[tau] = E || d x_next / d s_tau || (via JVP)
    """
    B, T, d = s.shape
    assert B == 1
    F = lambda _s: m(_s)  # returns full s_next

    sens = torch.zeros(T)
    for tau in range(T):
        vals = []
        for _ in range(n_trials):
            v = torch.zeros_like(s)
            v[:, tau, :] = torch.randn_like(v[:, tau, :])  # perturb only this slot
            jvp = jvp_F(F, s, v)  # shape [1,T,d]
            dxnext = jvp[:, 0, :]  # output token perturbation
            vals.append(dxnext.norm(dim=-1).item())
        sens[tau] = sum(vals) / len(vals)
    return sens

sens = sensitivity_profile(m, s, n_trials=3)
print("sens shape:", sens.shape)
print("sens (first 10):", sens[:10])
print("ratio max/min:", (sens.max()/ (sens.min()+1e-12)).item())

NameError: name 'torch' is not defined

In [6]:
p = sens / (sens.sum() + 1e-12)
entropy = (-(p * (p + 1e-12).log()).sum()).item()
entropy_uniform = torch.log(torch.tensor(float(L+1))).item()
print("entropy proxy:", entropy, "uniform entropy:", entropy_uniform)

NameError: name 'sens' is not defined

# $\gamma=1$

In [13]:
import sys
from pathlib import Path
import torch

ROOT = Path.cwd().parent  # notebook/ 的 parent
sys.path.insert(0, str(ROOT))

import dlphys.models  # trigger registration
from dlphys.config.base import ExperimentConfig
from dlphys.config.registry import build_model
from dlphys.analysis.jvp import jvp_F
from dlphys.utils.seed import set_seed

set_seed(0, deterministic=True)

L = 200
d_model = 16
d_k = 32
gamma = 0.8

cfg = ExperimentConfig(
    project_name="toy",
    device="cpu",
    seed=0,
    deterministic=True,
    extra={
        "model_name": "toy_attention",
        "model_kwargs": dict(d_model=d_model, d_k=d_k, L=L, num_heads=1, gamma=gamma, phi="identity"),
    }
)
m = build_model(cfg).eval()

B = 1
s = torch.randn(B, L+1, d_model)

In [15]:
from dlphys.analysis.lyapunov import lyapunov_max_benettin

# 用同一个初态（或拷贝一份）
s0 = s.clone()

F = lambda _s: m(_s)

out = lyapunov_max_benettin(
    F, s0,
    burn_in=int(2e4 * 0.9),
    T=2e4,
    return_traj=True
)

print("lambda_hat (per batch):", out["lambda_hat"])
print("lambda_mean:", out["lambda_mean"])

lambda_hat (per batch): tensor([-0.0053])
lambda_mean: tensor(-0.0053)


In [16]:
@torch.no_grad()
def sensitivity_profile(m, s, n_trials=5):
    """
    Returns: sens[ L+1 ] where sens[tau] = E || d x_next / d s_tau || (via JVP)
    """
    B, T, d = s.shape
    assert B == 1
    F = lambda _s: m(_s)  # returns full s_next

    sens = torch.zeros(T)
    for tau in range(T):
        vals = []
        for _ in range(n_trials):
            v = torch.zeros_like(s)
            v[:, tau, :] = torch.randn_like(v[:, tau, :])  # perturb only this slot
            jvp = jvp_F(F, s, v)  # shape [1,T,d]
            dxnext = jvp[:, 0, :]  # output token perturbation
            vals.append(dxnext.norm(dim=-1).item())
        sens[tau] = sum(vals) / len(vals)
    return sens

sens = sensitivity_profile(m, s, n_trials=3)
print("sens shape:", sens.shape)
print("sens (first 10):", sens[:10])
print("ratio max/min:", (sens.max()/ (sens.min()+1e-12)).item())

sens shape: torch.Size([201])
sens (first 10): tensor([0.0848, 0.0108, 0.0140, 0.0088, 0.0121, 0.0145, 0.0090, 0.0139, 0.0098,
        0.0112])
ratio max/min: 11.59415054321289


In [17]:
p = sens / (sens.sum() + 1e-12)
entropy = (-(p * (p + 1e-12).log()).sum()).item()
entropy_uniform = torch.log(torch.tensor(float(L+1))).item()
print("entropy proxy:", entropy, "uniform entropy:", entropy_uniform)

entropy proxy: 5.250389099121094 uniform entropy: 5.303304672241211


# Gain test

In [1]:
from dlphys.utils.seed import set_seed
from dlphys.config.base import ExperimentConfig
from dlphys.config.registry import build_model
from dlphys.analysis.lyapunov import lyapunov_max_benettin
import torch

def run_lambda_for_gain(g_v, *, seed=0, L=200, d_model=16, d_k=32, gamma=0.5, H=1,
                        burn_in=1500, T=2000, B=2, device="cpu"):

    set_seed(seed, deterministic=True)

    model_kwargs = dict(
        d_model=d_model,
        d_k=d_k,
        L=L,
        num_heads=H,
        gamma=gamma,
        phi="identity",
        bias=False,
        g_qk=1.0,
        g_v=float(g_v),
    )

    cfg = ExperimentConfig(
        project_name="toy",
        device=device,
        seed=seed,
        deterministic=True,
        extra={
            "model_name": "toy_attention",
            "model_kwargs": model_kwargs,
        },
    )

    # HARD CHECK (this should NOT be None)
    print("DEBUG model_kwargs keys:", cfg.extra["model_kwargs"].keys())
    print("DEBUG g_v passed:", cfg.extra["model_kwargs"].get("g_v", None))

    m = build_model(cfg).eval()
    s0 = torch.randn(B, L+1, d_model, device=device)
    F = lambda s: m(s)

    out = lyapunov_max_benettin(F, s0, burn_in=burn_in, T=T, return_traj=False)
    return out["lambda_mean"].item()

ModuleNotFoundError: No module named 'dlphys'

In [15]:
lam = run_lambda_for_gain(2.0, seed=0)
print("lambda:", lam)

DEBUG model_kwargs keys: dict_keys(['d_model', 'd_k', 'L', 'num_heads', 'gamma', 'phi', 'bias', 'g_qk', 'g_v'])
DEBUG g_v passed: 2.0
lambda: 0.000786280375905335


In [17]:
g_list = [0.5, 0.8, 1.0, 1.1, 1.2, 1.4, 1.7, 2.0]
for g in g_list:
    lam = run_lambda_for_gain(g, seed=0)
    print(f"g_v={g:>4}: lambda_mean={lam:+.6f}")

DEBUG model_kwargs keys: dict_keys(['d_model', 'd_k', 'L', 'num_heads', 'gamma', 'phi', 'bias', 'g_qk', 'g_v'])
DEBUG g_v passed: 0.5
g_v= 0.5: lambda_mean=+0.000786
DEBUG model_kwargs keys: dict_keys(['d_model', 'd_k', 'L', 'num_heads', 'gamma', 'phi', 'bias', 'g_qk', 'g_v'])
DEBUG g_v passed: 0.8
g_v= 0.8: lambda_mean=+0.000786
DEBUG model_kwargs keys: dict_keys(['d_model', 'd_k', 'L', 'num_heads', 'gamma', 'phi', 'bias', 'g_qk', 'g_v'])
DEBUG g_v passed: 1.0
g_v= 1.0: lambda_mean=+0.000786
DEBUG model_kwargs keys: dict_keys(['d_model', 'd_k', 'L', 'num_heads', 'gamma', 'phi', 'bias', 'g_qk', 'g_v'])
DEBUG g_v passed: 1.1
g_v= 1.1: lambda_mean=+0.000786
DEBUG model_kwargs keys: dict_keys(['d_model', 'd_k', 'L', 'num_heads', 'gamma', 'phi', 'bias', 'g_qk', 'g_v'])
DEBUG g_v passed: 1.2
g_v= 1.2: lambda_mean=+0.000786
DEBUG model_kwargs keys: dict_keys(['d_model', 'd_k', 'L', 'num_heads', 'gamma', 'phi', 'bias', 'g_qk', 'g_v'])
DEBUG g_v passed: 1.4
g_v= 1.4: lambda_mean=+0.000786
DEBU

In [19]:
import math, torch

print("id(m) =", id(m))

# check cfg inside model
print("m.cfg.g_v =", getattr(m.cfg, "g_v", None), "m.cfg.g_qk =", getattr(m.cfg, "g_qk", None))

# check actual weight scale
w = m.Wv[0].weight.detach().flatten()
print("Wv std =", w.std(unbiased=False).item(),
      "target ~", float(cfg.extra["model_kwargs"]["g_v"]) / math.sqrt(d_model))

# also check a deterministic hash-like scalar
print("Wv[0,0] =", m.Wv[0].weight.detach()[0,0].item())

id(m) = 2390815094816
m.cfg.g_v = 1.0 m.cfg.g_qk = 1.0


KeyError: 'g_v'