In [1]:
import sys
from pathlib import Path

# 假设 notebook/ 在 "Deep Learning Pack/notebook"
ROOT = Path.cwd().parent  # -> Deep Learning Pack
sys.path.insert(0, str(ROOT))

import torch
from dlphys.config.base import ExperimentConfig
from dlphys.config.registry import build_model
import dlphys.models  # 关键：触发模型注册（side effect）

In [2]:
from dlphys.models.toy_attention import ToyAttentionConfig, ToyAttentionDynamics

cfg_m = ToyAttentionConfig(d_model=16, d_k=32, L=4, num_heads=1, gamma=0.5, phi="identity")
m = ToyAttentionDynamics(cfg_m)

s0 = torch.randn(2, cfg_m.L + 1, cfg_m.d_model)  # B=2
s1 = m(s0)

print("s0", s0.shape, "s1", s1.shape)
print("finite?", torch.isfinite(s1).all().item())

s0 torch.Size([2, 5, 16]) s1 torch.Size([2, 5, 16])
finite? True


In [3]:
import dlphys.models  # 重新 import 一次（如果你改完文件但 kernel 没重启，需要 reload）
from importlib import reload
reload(dlphys.models)

cfg = ExperimentConfig(
    project_name="toy",
    device="cpu",
    extra={
        "model_name": "toy_attention",
        "model_kwargs": dict(d_model=16, d_k=32, L=3, num_heads=1, gamma=0.5, phi="identity"),
    }
)

m = build_model(cfg)
print(type(m))

s0 = torch.randn(2, 4, 16)
s1 = m(s0)
s1.shape

<class 'dlphys.models.toy_attention.ToyAttentionDynamics'>


torch.Size([2, 4, 16])

In [7]:
from dlphys.core.run_dynamics import rollout
import torch

cfg = ExperimentConfig(
    project_name="toy",
    device="cpu",
    extra={
        "model_name": "toy_attention",
        "model_kwargs": dict(d_model=16, d_k=32, L=5, num_heads=1, gamma=0.5, phi="identity"),
    }
)

s0 = torch.randn(2, 6, 16)
out = rollout(cfg, s0=s0, T=20)
len(out["states"]), out["states"][0].shape, out["states"][-1].shape

(21, torch.Size([2, 6, 16]), torch.Size([2, 6, 16]))

In [11]:
from dlphys.analysis.jvp import jvp_F
import torch

B = 1
d_model = 16

L = m.cfg.L                      # <-- 关键：从模型读
s = torch.randn(B, L+1, d_model)
v = torch.zeros_like(s)

# 扰动第 1 个 memory block（x_{t-1}）
v[:, 1, :] = torch.randn_like(v[:, 1, :])

jvp = jvp_F(lambda _s: m(_s), s, v)

# companion shift 预测：输出的第2个记忆位应该等于输入第1个记忆位
# （注意：你的 state 定义是 [x_t, x_{t-1}, x_{t-2}, ...]）
err = (jvp[:, 2, :] - v[:, 1, :]).norm() / (v[:, 1, :].norm() + 1e-12)
print("relative shift error:", err.item())

relative shift error: 0.0


In [89]:
import torch
import dlphys.models
from dlphys.config.registry import build_model
from dlphys.analysis.lyapunov import lyapunov_max_benettin
from dlphys.config.base import ExperimentConfig

L=500
cfg = ExperimentConfig(
    project_name="toy",
    device="cpu",
    extra={
        "model_name": "toy_attention",
        "model_kwargs": dict(d_model=16, d_k=32, L=L, num_heads=1, gamma=0.5, phi="identity"),
    }
)

m = build_model(cfg).eval()

# 初态：B=2 条轨迹
s0 = torch.randn(2, L+1, 16)

# 定义 F(s)=m(s)
F = lambda s: m(s)

out = lyapunov_max_benettin(F, s0, burn_in=500, T=1000, return_traj=True)

print("lambda_hat (per batch):", out["lambda_hat"])
print("lambda_mean:", out["lambda_mean"])
print("final_state shape:", out["final_state"].shape)

lambda_hat (per batch): tensor([-0.0027, -0.0027])
lambda_mean: tensor(-0.0027)
final_state shape: torch.Size([2, 501, 16])


In [87]:
import torch
import dlphys.models
from dlphys.config.registry import build_model
from dlphys.analysis.lyapunov import lyapunov_max_benettin
from dlphys.config.base import ExperimentConfig

L=500
cfg = ExperimentConfig(
    project_name="toy",
    device="cpu",
    extra={
        "model_name": "toy_attention",
        "model_kwargs": dict(d_model=16, d_k=32, L=L, num_heads=1, gamma=0., phi="identity"),
    }
)

m = build_model(cfg).eval()

# 初态：B=2 条轨迹
s0 = torch.randn(2, L+1, 16)

# 定义 F(s)=m(s)
F = lambda s: m(s)

out = lyapunov_max_benettin(F, s0, burn_in=500, T=1000, return_traj=True)

print("lambda_hat (per batch):", out["lambda_hat"])
print("lambda_mean:", out["lambda_mean"])
print("final_state shape:", out["final_state"].shape)

lambda_hat (per batch): tensor([-0.0036, -0.0026])
lambda_mean: tensor(-0.0031)
final_state shape: torch.Size([2, 501, 16])


In [81]:
import torch
from dlphys.analysis.lyapunov import lyapunov_max_benettin

# 线性 map: s_{t+1} = a s_t
a = 1.1
F = lambda s: a * s

s0 = torch.randn(2, L+1, 3)
out = lyapunov_max_benettin(F, s0, burn_in=50, T=200)
print(out["lambda_mean"], "target=", torch.tensor(a).log())

tensor(0.0953) target= tensor(0.0953)


In [85]:
import torch
from dlphys.analysis.jvp import jvp_F

# 1) 从模型里读维度，避免你手写错
B = 1
L = m.cfg.L
d_model = m.cfg.d_model

s = torch.randn(B, L+1, d_model)
v = torch.randn_like(s)
eps = 1e-4

F = lambda _s: m(_s)

# 2) JVP
jvp = jvp_F(F, s, v)

# 3) finite difference
fd = (F(s + eps*v) - F(s - eps*v)) / (2*eps)

rel = (jvp - fd).norm() / (fd.norm() + 1e-12)
print("JVP finite-diff relative error:", rel.item())

JVP finite-diff relative error: 0.00024629029212519526


# Test on Attention logits

In [7]:
import sys
from pathlib import Path
import torch

ROOT = Path.cwd().parent  # notebook/ 的 parent
sys.path.insert(0, str(ROOT))

import dlphys.models  # trigger registration
from dlphys.config.base import ExperimentConfig
from dlphys.config.registry import build_model
from dlphys.analysis.jvp import jvp_F
from dlphys.utils.seed import set_seed

set_seed(0, deterministic=True)

L = 200
d_model = 16
d_k = 32
gamma = 0.5

cfg = ExperimentConfig(
    project_name="toy",
    device="cpu",
    seed=0,
    deterministic=True,
    extra={
        "model_name": "toy_attention",
        "model_kwargs": dict(d_model=d_model, d_k=d_k, L=L, num_heads=1, gamma=gamma, phi="identity"),
    }
)
m = build_model(cfg).eval()

B = 1
s = torch.randn(B, L+1, d_model)

In [9]:
w = m.Wq[0].weight
print("Wq mean/std:", w.mean().item(), w.std(unbiased=False).item())
print("target std:", 1.0 / (m.cfg.d_model ** 0.5))

Wq mean/std: 0.0048353411257267 0.25518450140953064
target std: 0.25


In [13]:
from dlphys.analysis.lyapunov import lyapunov_max_benettin

# 用同一个初态（或拷贝一份）
s0 = s.clone()

F = lambda _s: m(_s)

out = lyapunov_max_benettin(
    F, s0,
    burn_in=200,
    T=800,
    return_traj=True
)

print("lambda_hat (per batch):", out["lambda_hat"])
print("lambda_mean:", out["lambda_mean"])

lambda_hat (per batch): tensor([-0.0010])
lambda_mean: tensor(-0.0010)


In [14]:
@torch.no_grad()
def sensitivity_profile(m, s, n_trials=5):
    """
    Returns: sens[ L+1 ] where sens[tau] = E || d x_next / d s_tau || (via JVP)
    """
    B, T, d = s.shape
    assert B == 1
    F = lambda _s: m(_s)  # returns full s_next

    sens = torch.zeros(T)
    for tau in range(T):
        vals = []
        for _ in range(n_trials):
            v = torch.zeros_like(s)
            v[:, tau, :] = torch.randn_like(v[:, tau, :])  # perturb only this slot
            jvp = jvp_F(F, s, v)  # shape [1,T,d]
            dxnext = jvp[:, 0, :]  # output token perturbation
            vals.append(dxnext.norm(dim=-1).item())
        sens[tau] = sum(vals) / len(vals)
    return sens

sens = sensitivity_profile(m, s, n_trials=3)
print("sens shape:", sens.shape)
print("sens (first 10):", sens[:10])
print("ratio max/min:", (sens.max()/ (sens.min()+1e-12)).item())

sens shape: torch.Size([201])
sens (first 10): tensor([0.8963, 0.0192, 0.0191, 0.0201, 0.0139, 0.0142, 0.0110, 0.0216, 0.0323,
        0.0132])
ratio max/min: 151.2091827392578


In [15]:
p = sens / (sens.sum() + 1e-12)
entropy = (-(p * (p + 1e-12).log()).sum()).item()
entropy_uniform = torch.log(torch.tensor(float(L+1))).item()
print("entropy proxy:", entropy, "uniform entropy:", entropy_uniform)

entropy proxy: 4.738344192504883 uniform entropy: 5.303304672241211


# Test on Logistical Map

In [91]:
import torch
from dlphys.analysis.lyapunov import lyapunov_max_benettin
from dlphys.analysis.jvp import jvp_F

def logistic_F(r: float):
    def F(x: torch.Tensor) -> torch.Tensor:
        return r * x * (1.0 - x)
    return F

# 用 Benettin (JVP) 算 lambda
def lyap_logistic_benettin(r: float, B=32, burn_in=1000, T=5000, seed=0):
    torch.manual_seed(seed)
    # 初态 x0 in (0,1)
    x0 = torch.rand(B, 1)  # shape [B,1]
    F = logistic_F(r)
    out = lyapunov_max_benettin(F, x0, burn_in=burn_in, T=T, return_traj=False)
    return out["lambda_mean"].item()

In [93]:
@torch.no_grad()
def lyap_logistic_direct(r: float, B=32, burn_in=1000, T=5000, seed=0):
    torch.manual_seed(seed)
    x = torch.rand(B)  # [B]
    # burn-in
    for _ in range(burn_in):
        x = r * x * (1 - x)
    # average log |f'(x)|
    acc = 0.0
    for _ in range(T):
        # derivative
        d = torch.abs(r * (1 - 2*x)) + 1e-12
        acc += torch.log(d).mean().item()
        x = r * x * (1 - x)
    return acc / T

In [95]:
for r in [2.5, 3.2, 3.5, 3.8, 4.0]:
    lam_b = lyap_logistic_benettin(r, B=64, burn_in=2000, T=8000, seed=0)
    lam_d = lyap_logistic_direct(r, B=64, burn_in=2000, T=8000, seed=0)
    print(f"r={r:>3}:  benettin={lam_b:+.4f},  direct={lam_d:+.4f},  diff={lam_b-lam_d:+.4e}")

r=2.5:  benettin=-0.6931,  direct=-0.6931,  diff=-8.9407e-08
r=3.2:  benettin=-0.9163,  direct=-0.9163,  diff=+2.9802e-08
r=3.5:  benettin=-0.8726,  direct=-0.8726,  diff=-1.4901e-08
r=3.8:  benettin=+0.4319,  direct=+0.4319,  diff=-5.3402e-07
r=4.0:  benettin=+0.8223,  direct=+0.8223,  diff=-8.0016e-08


In [97]:
def check_jvp_logistic(r=3.8, eps=1e-6):
    F = logistic_F(r)
    x = torch.rand(1, 1)           # [1,1]
    v = torch.randn_like(x)        # [1,1]

    jvp = jvp_F(F, x, v)
    analytic = (r * (1 - 2*x)) * v

    rel1 = (jvp - analytic).abs().max() / (analytic.abs().max() + 1e-12)

    fd = (F(x + eps*v) - F(x - eps*v)) / (2*eps)
    rel2 = (jvp - fd).abs().max() / (fd.abs().max() + 1e-12)

    print("JVP vs analytic rel err:", rel1.item())
    print("JVP vs finite-diff rel err:", rel2.item())

check_jvp_logistic(3.8)

JVP vs analytic rel err: 0.0
JVP vs finite-diff rel err: 0.006280352361500263
