In [1]:
# ======================================================================
#  Chain-of-Thought category probe – notebook driver
# ======================================================================
# 1.  Global configuration ------------------------------------------------
%cd ../..
%pwd
from pathlib import Path

MODEL_PATH      = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"      # HF hub or local dir
GENERAL_DIR = Path("c_cluster_analysis/outputs/hints/mmlu/DeepSeek-R1-Distill-Llama-8B")
CATEGORY_FILE   = Path(GENERAL_DIR / "confidence" / "none_unverb_5001.json")                    # ↳ annotation JSON
COT_FILE   = Path(GENERAL_DIR / "orig" / "none_5001.json")                    # ↳ annotation JSON
MAIN_CATEGORIES = ["backtracking", "logical_deduction"]           # target label(s)
LAYERS          = list(range(1, 33, 5))                           # every 5-th layer
MAX_SAMPLES     = None                                            # or e.g. 200
WHITELIST       = None                                            # path to JSON list of q-ids
CAPTURE_FILE   = Path(GENERAL_DIR / "layprobe" / "none_unverb_5001.json")                    # ↳ annotation JSON
CAPTURE_FILE    = Path("outputs/hidden_capture.json")             # raw vectors
ATTRVEC_DIR   = Path(GENERAL_DIR / "attr_vecs")                    # ↳ annotation JSON

# 2.  Imports & helpers ---------------------------------------------------
import json, logging
logging.basicConfig(level=logging.INFO)

from c_cluster_analysis.cat_probe_5.cot_probe_utils import (
    load_model_and_tokenizer,
    gather_category_sentences,
    run_probe_capture_for_categories,
    train_linear_probes,
    save_attribute_vectors,
)

# 3.  Model / tokenizer ---------------------------------------------------
model, tok, _, _ = load_model_and_tokenizer(MODEL_PATH)

  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]
  from .autonotebook import tqdm as notebook_tqdm


/root/CoTFaithChecker


INFO:root:Loading deepseek-ai/DeepSeek-R1-Distill-Llama-8B on cuda
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]


In [2]:
# 4.  Build the sentence-selection map ------------------------------------
selection_map = gather_category_sentences(
    CATEGORY_FILE,
    main_categories = MAIN_CATEGORIES,
    whitelist       = WHITELIST,
    max_samples     = MAX_SAMPLES,
)

print(f"Capturing {sum(len(v) for v in selection_map.values())} "
      f"sentences across {len(selection_map)} questions")

# 5.  Hidden-state capture -------------------------------------------------
captured = run_probe_capture_for_categories(
    model              = model,
    tok                = tok,
    cot_file           = COT_FILE,
    selection_map      = selection_map,
    layers             = LAYERS,
    output_file        = CAPTURE_FILE,
)

# 6.  Linear-probe training -----------------------------------------------
probes, metrics = train_linear_probes(
    captured["vectors"],
    captured["labels"],
    test_size      = 0.2,
    random_state   = 42,
)

print("\n=== Probe results (weighted F1) ===")
for ln in sorted(metrics):
    print(f"{ln:>8}:  acc {metrics[ln]['accuracy']:.3f}   "
          f"f1 {metrics[ln]['f1']:.3f}")

# 7.  (optional) save attribute vectors ------------------------------------
save_attribute_vectors(captured["attr_vecs"], ATTRVEC_DIR)
print(f"\nFinished – vectors in {CAPTURE_FILE}")

Capturing 2515 sentences across 198 questions


capturing: 100%|██████████| 198/198 [09:49<00:00,  2.98s/q]
INFO:root:Saved capture to outputs/hidden_capture.json
training probes: 100%|██████████| 7/7 [05:14<00:00, 44.91s/layer]


=== Probe results (weighted F1) ===
 layer_1:  acc 0.946   f1 0.936
layer_11:  acc 0.968   f1 0.966
layer_16:  acc 0.974   f1 0.973
layer_21:  acc 0.974   f1 0.973
layer_26:  acc 0.970   f1 0.969
layer_31:  acc 0.968   f1 0.966
 layer_6:  acc 0.966   f1 0.964

Finished – vectors in outputs/hidden_capture.json





In [11]:
# ╔════════════════════════════════════════════════════════════════════╗
# ║  Steering experiment                                               ║
# ╚════════════════════════════════════════════════════════════════════╝
from pathlib import Path

from c_cluster_analysis.cat_probe_5.cot_steer_utils4 import (
    load_attr_vectors,
    run_steering_experiment,
)

# ─── user-editable parameters ──────────────────────────────────────────
CAT_FROM         = "backtracking"          # steer *with* this
CAT_TO           = "logical_deduction"     # steer *into* this
ALPHAS           = [0.0, 0.3, 0.6, 1.0, 10.0]    # 0 → no steering
ATTR_VEC_DIR     = GENERAL_DIR / "attr_vecs"           # from probe stage
QUESTIONS_FILE   = "data/mmlu/input_mcq_data.json"      # raw questions
HINTS_FILE       = None
#HINTS_FILE = "data/mmlu/hints_sycophancy.json"
FULL_COT_FILE    = COT_FILE                            # same as before
OUTPUT_STEER_JSON= GENERAL_DIR / "steering" / f"{CAT_FROM}_to_{CAT_TO}.json"
LAYERS_FOR_STEER = ["layer_11"]                        # e.g. last layer
MAX_QUESTIONS    = 5                                   # shorten dev run
# ───────────────────────────────────────────────────────────────────────

attr_vecs = load_attr_vectors(ATTR_VEC_DIR)
steer_vec = {ln: attr_vecs[CAT_FROM][ln] for ln in LAYERS_FOR_STEER}

steer_results = run_steering_experiment(
    model              = model,
    tok                = tok,
    steer_vectors      = steer_vec,
    cat_target         = CAT_TO,
    alphas             = ALPHAS,
    questions_file     = QUESTIONS_FILE,
    hints_file         = HINTS_FILE,
    full_cot_file      = FULL_COT_FILE,
    output_file        = OUTPUT_STEER_JSON,
    max_questions      = MAX_QUESTIONS,
)

print(f"Saved steered generations to {OUTPUT_STEER_JSON}")


ModuleNotFoundError: No module named 'steer_mod'

In [5]:
print("Available categories:", list(attr_vecs.keys()))
%ls c_cluster_analysis/outputs/hints/mmlu/DeepSeek-R1-Distill-Llama-8B/attr_vecs/none_unverb_5001.json

Available categories: []
backtracking.pt  logical_deduction.pt


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
