In [1]:
%cd ../..
%pwd
import os, json, math, itertools, random, logging, pathlib, re, time, sys
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, precision_recall_fscore_support
)
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'

# === Paths & constants =======================================================
MODEL_NAME        = "DeepSeek-R1-Distill-Llama-8B"
MODEL_PATH        = "deepseek-ai/" + MODEL_NAME
INCOUNT   = 2001                   # keep consistent with earlier run
DATA_DIR          = Path("data/mmlu")
#DATA_DIR       = Path("c_cluster_analysis/outputs/hints/mmlu") / MODEL_NAME / "cat_probe"
OUT_GEN       = Path("c_cluster_analysis/outputs/hints/mmlu/DeepSeek-R1-Distill-Llama-8B")
OUT_DIR       = OUT_GEN / "catprob"
QUESTIONS_JSON    = DATA_DIR / "input_mcq_data.json"
FULL_COT_JSON     = DATA_DIR / MODEL_NAME / "none" / f"completions_with_{INCOUNT}.json"

TARGET_CAT        = "backtracking"         # ⬅ pick any of the 12 categories
LAYER_STRIDE      = 5                      # every ~5 layers
CAPTURE_OUTFILE   = OUT_DIR / f"hidden_layers_stride{LAYER_STRIDE}.json"
ATTRIBUTE_VEC_OUT = OUT_DIR / f"attrvec_{TARGET_CAT}.pt"

CATEGORY_NAMES = [
    "problem_restating","knowledge_augmentation","assumption_validation",
    "logical_deduction","option_elimination","uncertainty_or_certainty_expression",
    "backtracking","forward_planning","decision_confirmation",
    "answer_reporting","option_restating","other",
]
target_id = CATEGORY_NAMES.index(TARGET_CAT)


/root/CoTFaithChecker


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
# 1. Hidden-state capture (only runs once)
from c_cluster_analysis.cat_probe_3after.probe_capture_general import (
    run_probe_capture
)

if not CAPTURE_OUTFILE.exists():
    n_layers = 32   # DeepSeek-R1-Distill-Llama-8B
    layers = list(range(1, n_layers + 1, LAYER_STRIDE))  # 1-based indices
    print("Capturing layers:", layers)

    _ = run_probe_capture(
        model_path     = MODEL_PATH,
        questions_file = str(QUESTIONS_JSON),
        full_cot_file  = str(FULL_COT_JSON),
        output_file    = str(CAPTURE_OUTFILE),
        layers         = layers,                  # <-- key change
        keep_attn_and_logits = True,              # so we can inspect later
        max_questions = 2,
    )
else:
    print("Using cached capture at", CAPTURE_OUTFILE)


  from .autonotebook import tqdm as notebook_tqdm


Using cached capture at c_cluster_analysis/outputs/hints/mmlu/DeepSeek-R1-Distill-Llama-8B/catprob/hidden_layers_stride5.json


In [3]:
# 2. Load capture → DataFrame
def load_capture(path: Path) -> pd.DataFrame:
    rows = []
    for obj in json.loads(path.read_text()):
        qid = obj["question_id"]
        for sent in obj["sentences"]:
            base = {
                "question_id": qid,
                "sentence_id": sent["sentence_id"],
                "sentence": sent["sentence"],
            }
            # one column per captured layer
            for k, v in sent["pooled_hs"].items():   # k=="layer_5", …
                base[k] = np.array(v, dtype=np.float32)
            # keep extra diagnostics if present
            for extra in ("answer_logits","avg_att_prompt"):
                if extra in sent: base[extra] = sent[extra]
            rows.append(base)
    return pd.DataFrame(rows)

capt_df = load_capture(CAPTURE_OUTFILE)
print("Loaded", len(capt_df), "sentences")

# merge category annotations ---------------------------------------------------
CAT_ANN_FILE = OUT_GEN / "confidence" / "sycophancy_unverb_2001.json"
def load_categories(path: Path) -> pd.DataFrame:
    rows = []
    for obj in json.loads(path.read_text()):
        qid = obj["question_id"]
        for ann in obj["annotations"]:
            rec = {"question_id": qid, "sentence_id": ann["sentence_id"]}
            rec.update({c: ann[c] for c in CATEGORY_NAMES})
            rows.append(rec)
    return pd.DataFrame(rows)
cats_df = load_categories(CAT_ANN_FILE)
df = capt_df.merge(cats_df, on=["question_id","sentence_id"])
print("After merge:", df.shape)


Loaded 109 sentences
After merge: (0, 24)


In [4]:
# 3. Probe per layer (binary: TARGET_CAT vs rest)
layer_cols = sorted([c for c in df.columns if c.startswith("layer_")],
                    key=lambda x: int(x.split("_")[1]))
print("Layers:", layer_cols)

metrics = []
for col in layer_cols:
    X = np.stack(df[col].values)
    y = (df[TARGET_CAT]==1).astype(int).values

    idx_train, idx_test = train_test_split(
        np.arange(len(y)), test_size=0.2, stratify=y, random_state=42)
    clf = LogisticRegression(max_iter=500, solver="lbfgs")
    clf.fit(X[idx_train], y[idx_train])

    y_pred = clf.predict(X[idx_test])
    acc  = accuracy_score(y[idx_test], y_pred)
    f1   = f1_score     (y[idx_test], y_pred)
    metrics.append((col, acc, f1, clf))

# Plot F1 vs layer
plt.figure(figsize=(6,3))
plt.plot([int(c.split("_")[1]) for c,_,_,_ in metrics],
         [m[2] for m in metrics], marker="o")
plt.xlabel("Layer"); plt.ylabel("F1"); plt.title(f"{TARGET_CAT} probe")
plt.ylim(0,1); plt.grid(); plt.show()

best_layer, best_acc, best_f1, best_clf = max(metrics, key=lambda t: t[2])
print(f"▶ Best: {best_layer} – acc {best_acc:.3f}  F1 {best_f1:.3f}")


Layers: ['layer_1', 'layer_6', 'layer_11', 'layer_16', 'layer_21', 'layer_26', 'layer_31']


ValueError: need at least one array to stack

In [None]:
# 4. Build & store attribute / steering vector
best_vecs      = np.stack(df[best_layer].values)
mask_target    = (df[TARGET_CAT]==1).values
attr_vec       = best_vecs[mask_target].mean(0) - best_vecs[~mask_target].mean(0)

torch.save(torch.tensor(attr_vec, dtype=torch.float32), ATTRIBUTE_VEC_OUT)
print("Saved attribute vector →", ATTRIBUTE_VEC_OUT)


In [None]:
# 5. Steering demo: inject at generation time
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained(MODEL_PATH)
mdl = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto")
mdl.eval()

layer_idx = int(best_layer.split("_")[1])   # 1-based
alpha = 2.5                                 # steering strength (try 0-5)

attr_t = torch.tensor(attr_vec, device=mdl.device, dtype=mdl.dtype)

def add_attrvec_hook(module, input, output):
    # output: (batch, seq, hidden)
    return output + alpha * attr_t

handle = mdl.model.layers[layer_idx-1].register_forward_hook(add_attrvec_hook)

prompt = "Explain why the sky appears blue during the day."
ids = tok.encode(prompt, return_tensors="pt").to(mdl.device)
out = mdl.generate(ids, max_new_tokens=120, temperature=0.7)
print(tok.decode(out[0], skip_special_tokens=True))

handle.remove()   # clean up
