In [1]:
# ======================================================================
#  Chain-of-Thought category probe – notebook driver
# ======================================================================
# 1.  Global configuration ------------------------------------------------
%cd ../..
%pwd
from pathlib import Path

MODEL_PATH      = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"      # HF hub or local dir
GENERAL_DIR = Path("c_cluster_analysis/outputs/hints/mmlu/DeepSeek-R1-Distill-Llama-8B")
CATEGORY_FILE   = Path(GENERAL_DIR / "confidence" / "none_unverb_5001.json")                    # ↳ annotation JSON
COT_FILE   = Path(GENERAL_DIR / "orig" / "none_5001.json")                    # ↳ annotation JSON
MAIN_CATEGORIES = ["backtracking", "logical_deduction"]           # target label(s)
LAYERS          = list(range(1, 33, 5))                           # every 5-th layer
MAX_SAMPLES     = None                                            # or e.g. 200
WHITELIST       = None                                            # path to JSON list of q-ids
CAPTURE_FILE   = Path(GENERAL_DIR / "layprobe" / "none_unverb_5001.json")                    # ↳ annotation JSON
CAPTURE_FILE    = Path("outputs/hidden_capture.json")             # raw vectors
ATTRVEC_DIR   = Path(GENERAL_DIR / "attr_vecs")                    # ↳ annotation JSON

# 2.  Imports & helpers ---------------------------------------------------
import json, logging
logging.basicConfig(level=logging.INFO)

from c_cluster_analysis.cat_probe_5.cot_probe_utils import (
    load_model_and_tokenizer,
    gather_category_sentences,
    run_probe_capture_for_categories,
    train_linear_probes,
    save_attribute_vectors,
)

# 3.  Model / tokenizer ---------------------------------------------------
model, tok, _, _ = load_model_and_tokenizer(MODEL_PATH)

  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]
  from .autonotebook import tqdm as notebook_tqdm


/root/CoTFaithChecker


INFO:root:Loading deepseek-ai/DeepSeek-R1-Distill-Llama-8B on cuda
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]


In [2]:
# ╔══════════════════════════════════════════════════════════════════╗
# ║  Runtime stride checker – install on **all** decoder layers      ║
# ╚══════════════════════════════════════════════════════════════════╝
def install_stride_checkers(model, every_n_tokens=1):
    """
    Prints the hidden-state shape/stride that *leaves* each decoder layer.
    Any layer whose last-dimension ≠ model.config.hidden_size (4096) or whose
    tensor is not contiguous will be reported **once**.

    `every_n_tokens` lets you throttle the prints if generation is long.
    """
    bad_layers = set()
    step_cnt   = {"tok": 0}        # mutable so inner fn can modify

    def mk_hook(idx):
        def hook(mod, _inp, out):
            step_cnt["tok"] += 1
            if step_cnt["tok"] % every_n_tokens:
                return

            t = out[0] if isinstance(out, tuple) else out
            if t.size(-1) != model.config.hidden_size or not t.is_contiguous():
                if idx not in bad_layers:                # report once / layer
                    bad_layers.add(idx)
                    print(f"[stride-ERR] after layer_{idx+1:02d}: "
                          f"shape={tuple(t.shape)} stride={t.stride()} "
                          f"contiguous={t.is_contiguous()}")
        return hook

    handles = []
    for i, layer in enumerate(model.model.layers):
        handles.append(layer.register_forward_hook(mk_hook(i)))
    print(f"✔ stride checkers installed on {len(handles)} layers")
    return handles


In [3]:
# ╔══════════════════════════════════════════════════════════════════╗
# ║  Steering experiment (fixed)                                     ║
# ╚══════════════════════════════════════════════════════════════════╝
from c_cluster_analysis.cat_probe_5.cot_steer_utils5 import (
    load_attr_vectors,
    run_steering_experiment,
)

CAT_FROM  = "backtracking"
CAT_TO    = "logical_deduction"
ALPHAS    = [0.0, 0.3, 0.6, 1.0, 10.0]

ATTR_VEC_DIR   = GENERAL_DIR / "attr_vecs"
QUESTIONS_FILE = "data/mmlu/input_mcq_data.json"
HINTS_FILE     = None        # or a path to your hints file
OUTPUT_STEER   = GENERAL_DIR / "steering" / f"{CAT_FROM}_to_{CAT_TO}.json"
LAYERS_TO_USE  = ["layer_11"]      # where the probe worked best
MAX_Q          = 5                 # dev run

attr_vecs  = load_attr_vectors(ATTR_VEC_DIR)
steer_vecs = {ln: attr_vecs[CAT_FROM][ln] for ln in LAYERS_TO_USE}

handles_dbg = install_stride_checkers(model)   # ← install

# ---- now run just ONE question with the smallest alpha list ----
try:
    _ = run_steering_experiment(
            model=model, tok=tok,
            steer_vectors=steer_vecs,
            cat_target=CAT_TO,
            alphas=[0.0, 1.0],       # keep short
            questions_file=QUESTIONS_FILE,
            full_cot_file=COT_FILE,
            output_file="debug.json",
            max_questions=1,
        )
finally:                          # always clean up
    for h in handles_dbg:
        h.remove()


print(f"\n✔ Done – generations stored in:  {OUTPUT_STEER}")


  out[cat] = torch.load(fp, map_location="cpu")


✔ stride checkers installed on 32 layers


steering:   0%|          | 0/1 [00:01<?, ?q/s]


RuntimeError: The size of tensor a (32) must match the size of tensor b (128) at non-singleton dimension 3

In [3]:
def install_debug_hook(layer_idx=10):        # 0-based
    def dbg(mod, inp, out):
        t = out[0] if isinstance(out, tuple) else out
        print(f"[dbg] layer {layer_idx}  shape {tuple(t.shape)}  stride {t.stride()}"
              f"  contiguous={t.is_contiguous()}")
    return model.model.layers[layer_idx].register_forward_hook(dbg)

handle_dbg = install_debug_hook(10)   # layer_11
# run one generation step to see the printout …
# afterwards:
handle_dbg.remove()


In [None]:
# ╔══════════════════════════════════════════════════════════╗
# ║  MINI-TEST – steering hook + RoPE reshape (bullet-proof) ║
# ╚══════════════════════════════════════════════════════════╝
import torch
import inspect
import importlib
from c_cluster_analysis.cat_probe_5 import cot_steer_utils4 as steer
importlib.reload(steer)                         # make sure it's current

# ── pick layer & steering vector ────────────────────────────
VEC   = attr_vecs["backtracking"]["layer_11"]   # 4096-d mean vector
LIDX  = 10                                      # zero-based → layer_11
layer = model.model.layers[LIDX]
print("layer.forward →", inspect.signature(layer.forward))

# ── robust fallback for (cos, sin) ──────────────────────────
def safe_rope_cache(seq_len: int, *, device, dtype):
    """Return (cos, sin) already broadcastable over heads."""
    n_heads  = model.config.num_attention_heads
    head_dim = model.config.hidden_size // n_heads
    cos = torch.ones(seq_len, n_heads, head_dim, device=device, dtype=dtype)
    sin = torch.zeros_like(cos)
    return cos, sin                               # shapes (S, H, D)

SEQ = 4
cos, sin = safe_rope_cache(SEQ, device=model.device, dtype=model.dtype)

# ── dummy input ─────────────────────────────────────────────
x = torch.randn(1, SEQ, model.config.hidden_size,
                device=model.device, dtype=model.dtype)

def fwd_once(inp, hook=None):
    h = hook and layer.register_forward_hook(hook)
    out = layer(
        hidden_states       = inp,
        attention_mask      = None,
        position_ids        = None,
        past_key_value      = None,
        position_embeddings = (cos, sin),   # <- always provided
        output_attentions   = False,
        use_cache           = False,
    )[0]                                         # (B, S, H)
    if h: h.remove()
    return out

# 1) plain forward ------------------------------------------------------
out_plain = fwd_once(x)
assert out_plain.shape == (1, SEQ, model.config.hidden_size)
print("✓ plain  forward ok :", out_plain.shape, out_plain.stride())

# 2) forward with steering hook ----------------------------------------
hook = steer._SteeringHook(VEC, alpha=1.0)
out_hook = fwd_once(x, hook)
assert out_hook.shape == out_plain.shape
print("✓ hooked forward ok :", out_hook.shape, out_hook.stride())

# 3) reshape exactly like attention does -------------------------------
q = layer.self_attn.q_proj(out_hook)             # (1,SEQ,4096)
q = q.view(1, SEQ,
           model.config.num_attention_heads,
           -1)                                   # (1,SEQ,32,128)
assert q.shape[-2:] == (model.config.num_attention_heads, 128)
print("✓ reshape  view ok  :", q.shape, "strides", q.stride())


layer.forward → (hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[transformers.cache_utils.Cache] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: typing_extensions.Unpack[transformers.modeling_flash_attention_utils.FlashAttentionKwargs]) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]


RuntimeError: The size of tensor a (4) must match the size of tensor b (32) at non-singleton dimension 2

In [5]:
att = model.model.layers[10].self_attn        # attention block
print("attrs with 'rot':", [n for n in dir(att) if "rot" in n])
for subname, submod in att.named_modules():
    if any("rot" in n for n in dir(submod)):
        print("found deeper at ⇒", subname, type(submod))
        print("   inner attrs:", [n for n in dir(submod) if "rot" in n])


attrs with 'rot': []


In [9]:
import c_cluster_analysis.cat_probe_5.cot_steer_utils4 as steer
print("using:", steer.__file__)


using: /root/CoTFaithChecker/c_cluster_analysis/cat_probe_5/cot_steer_utils4.py


In [5]:
vec = attr_vecs["backtracking"]["layer_11"]      # the one you load
print("vector dim:", vec.numel())                # → 1024  (wrong)
print("model hidden_size:", model.config.hidden_size)  # → 4096


vector dim: 4096
model hidden_size: 4096
