In [1]:
%cd ../..
%pwd

/root/CoTFaithChecker


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


'/root/CoTFaithChecker'

In [2]:
%ls h_hidden_space/outputs/f1_hint_xyyx/hints/mmlu/DeepSeek-R1-Distill-Llama-8B/*

h_hidden_space/outputs/f1_hint_xyyx/hints/mmlu/DeepSeek-R1-Distill-Llama-8B/induced_urgency:
[0m[01;34m500_captures[0m/

h_hidden_space/outputs/f1_hint_xyyx/hints/mmlu/DeepSeek-R1-Distill-Llama-8B/none:
[01;34m500_captures[0m/

h_hidden_space/outputs/f1_hint_xyyx/hints/mmlu/DeepSeek-R1-Distill-Llama-8B/sycophancy:
[01;34m500_captures[0m/

h_hidden_space/outputs/f1_hint_xyyx/hints/mmlu/DeepSeek-R1-Distill-Llama-8B/unethical_information:
[01;34m500_captures[0m/


In [3]:
import os, pickle, datetime, re
from pathlib import Path

import torch, numpy as np
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler

ROOT = Path("i_probe_steer/extractions/hints/mmlu/DeepSeek-R1-Distill-Llama-8B")
ROOT2 = Path("h_hidden_space/outputs/f1_hint_xyyx/hints/mmlu/DeepSeek-R1-Distill-Llama-8B/sycophancy")
DIR_NONE        = ROOT / "none/500_captures"
DIR_unverb  = ROOT2 / "500_captures"

def list_pt(directory):
    return sorted(str(p) for p in directory.glob("*.pt"))

files_none  = list_pt(DIR_NONE)
files_sync  = list_pt(DIR_unverb)
assert files_none and files_sync, "No .pt files found ! check paths"


In [4]:
pointer_re = re.compile(br"^version https://git-lfs.github.com/spec/")
def safe_torch_load(fname):
    with open(fname, "rb") as f:
        if pointer_re.match(f.read(80)):
            print(f"{Path(fname).name}: Git-LFS pointer, skipping")
            return None
    return torch.load(fname, map_location="cpu")


In [None]:
def dict_to_list(d):
    return [d[f"layer_{i}"] for i in range(len(d))]
def normalise_batch(obj):
    if isinstance(obj, dict):
        return dict_to_list(obj)
    elif isinstance(obj, (list, tuple)):
        return list(obj)
    raise TypeError(f"Unexpected batch type: {type(obj)}")

sample = None
for fp in files_none + files_sync:
    raw = safe_torch_load(fp)
    if raw is not None:
        sample = normalise_batch(raw)
        break

assert sample is not None, "No real .pt blobs present – run `git lfs pull`"
N_LAYERS     = len(sample)
HIDDEN_SIZE  = sample[0].shape[-1]
print(f"Detected {N_LAYERS} layers, hidden size = {HIDDEN_SIZE}")



Detected 33 layers, hidden size = 4096


  return torch.load(fname, map_location="cpu")


In [6]:
layer_blobs = {L: [] for L in range(N_LAYERS)}
labels      = []

def add_files(file_list, lab):
    for fp in tqdm(file_list, desc=f"label={lab}"):
        raw = safe_torch_load(fp)
        if raw is None:
            continue
        batch = normalise_batch(raw)
        B = batch[0].shape[0]
        for L, h in enumerate(batch):
            layer_blobs[L].append(h.float().numpy())
        labels.extend([lab]*B)

add_files(files_none,  0)   # 0 = none
add_files(files_sync,  1)   # 1 = unverb

assert labels, "No usable data loaded."
labels = np.asarray(labels, dtype=np.int8)
print("Total samples:", len(labels))

layer_X = {L: np.concatenate(layer_blobs[L], axis=0) for L in layer_blobs}
del layer_blobs

scalers = {}
for L in range(N_LAYERS):
    scaler = StandardScaler()
    layer_X[L] = scaler.fit_transform(layer_X[L])
    scalers[L] = scaler

  return torch.load(fname, map_location="cpu")
label=0: 100%|██████████| 16/16 [00:00<00:00, 85.17it/s]
label=1: 100%|██████████| 16/16 [00:00<00:00, 96.93it/s]


Total samples: 990


In [7]:

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
layer_scores = []

print("\nCross-validation accuracy:")
for L in range(N_LAYERS):
    clf = LogisticRegression(penalty="l2", C=1.0, max_iter=1000, n_jobs=-1)
    acc = cross_val_score(clf, layer_X[L], labels, cv=cv, scoring="accuracy").mean()
    print(f"layer {L:2d}:  {acc:.3f}")
    layer_scores.append(acc)

best_layer = int(np.argmax(layer_scores))
print(f"\First highest layer = {best_layer}  (acc = {layer_scores[best_layer]:.3f})")

# final fit on the best layer, save probe + Δµ
X_best  = layer_X[best_layer]
clf_best = LogisticRegression(penalty="l2", C=1.0, max_iter=1000, n_jobs=-1).fit(X_best, labels)

mu_none = X_best[labels == 0].mean(axis=0)
mu_sync = X_best[labels == 1].mean(axis=0)
delta_mu = mu_sync - mu_none

OUT = ROOT / f"unverb_probe_layer{best_layer}.pkl"
with open(OUT, "wb") as f:
    pickle.dump(
        dict(
            layer       = best_layer,
            weights     = clf_best.coef_[0].astype(np.float32),
            intercept   = float(clf_best.intercept_[0]),
            delta_mu    = delta_mu.astype(np.float32),
            hidden_size = HIDDEN_SIZE,
            created     = datetime.datetime.now().isoformat(timespec="seconds"),
            acc_cv      = float(layer_scores[best_layer]),
            note        = "0 = none, 1 = unverb; StandardScaler applied.",
        ),
        f,
    )
print(f"\nProbe saved to {OUT.relative_to(Path('.'))}")



Cross-validation accuracy:


layer  0:  0.665
layer  1:  0.977
layer  2:  0.981
layer  3:  0.992
layer  4:  0.993
layer  5:  0.993
layer  6:  0.994
layer  7:  0.996
layer  8:  0.999
layer  9:  0.999
layer 10:  0.999
layer 11:  0.999
layer 12:  0.999
layer 13:  1.000
layer 14:  1.000
layer 15:  1.000
layer 16:  1.000
layer 17:  1.000
layer 18:  0.999
layer 19:  0.999
layer 20:  0.998
layer 21:  0.998
layer 22:  0.998
layer 23:  0.998
layer 24:  0.998
layer 25:  0.999
layer 26:  0.998
layer 27:  0.998
layer 28:  0.998
layer 29:  0.997
layer 30:  0.997
layer 31:  0.998
layer 32:  0.997
\First highest layer = 13  (acc = 1.000)

Probe saved to i_probe_steer/extractions/hints/mmlu/DeepSeek-R1-Distill-Llama-8B/unverb_probe_layer13.pkl
