In [15]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
import zarr

*******************************************

***Notebook aimed at viewing the raw data structure***

---------------

In [16]:
# Directory of one captured run; the cells below read meta.json, tokens.parquet
# and the *.zarr stores from it.
# Built from path components so it resolves on POSIX as well as on Windows:
# a raw "traces\<run>" string is a single file name containing a backslash on POSIX.
run_dir = Path("traces") / "20251216-193911_gpt2_csqa_validation_n4"

# Run-level metadata stored as UTF-8 JSON next to the traces.
meta = json.loads((run_dir / "meta.json").read_text(encoding="utf-8"))
print("meta keys:", list(meta.keys()))
# .get() is used so a missing key prints None instead of raising.
print("model:", meta.get("model"), "dataset:", meta.get("dataset"), "split:", meta.get("split"))
print("n_examples:", meta.get("n_examples"), "max_seq_len:", meta.get("max_seq_len"))
print("num_layers:", meta.get("num_layers"), "num_heads:", meta.get("num_heads"), "dtype:", meta.get("dtype"))
print("capture:", meta.get("capture"))

meta keys: ['run_id', 'model', 'arch', 'dataset', 'split', 'n_examples', 'max_seq_len', 'num_layers', 'num_heads', 'head_dim', 'layers_stored', 'heads_stored', 'dtype', 'capture', 'has_targets', 'time']
model: gpt2 dataset: csqa split: validation
n_examples: 4 max_seq_len: 128
num_layers: 12 num_heads: 12 dtype: float16
capture: ['attn', 'qkv', 'hidden', 'resid']


In [17]:
# Per-example table stored in the run directory.
df = pd.read_parquet(run_dir / "tokens.parquet")
print(f"rows: {len(df)}")
print(f"cols: {list(df.columns)}")
# Show the example_id and answerKey columns for the first four rows.
print(df[["example_id", "answerKey"]].head(4))

rows: 4
cols: ['example_id', 'text', 'input_ids', 'attention_mask', 'offset_mapping', 'tokens', 'answerKey', 'csqa_choices']
                         example_id answerKey
0  1afa02df02c908a558b4036e80242fac         A
1  a7ab086045575bb497933726e4e6ad28         A
2  b8c0a4703079cf661d7261a60a1bcbff         B
3  e68fb2448fd74e402aae9982aa76e527         A


In [18]:
pd.set_option("display.max_colwidth", 400)
# Preview the stored text of the first (at most) four examples.
for i in range(min(4, len(df))):
    example_id = df.loc[i, "example_id"]
    txt = df.loc[i, "text"]
    print("\nIDX:", i, "example_id:", example_id)
    print("text len:", len(txt))
    # repr() makes newlines and other whitespace in the text visible.
    print("text repr head:", repr(txt[:250]))
    print("text head:\n", txt[:400])



IDX: 0 example_id: 1afa02df02c908a558b4036e80242fac
text len: 123
text repr head: 'Q: A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?\nChoices:\nA:'
text head:
 Q: A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?
Choices:
A:

IDX: 1 example_id: a7ab086045575bb497933726e4e6ad28
text len: 48
text repr head: 'Q: What do people aim to do at work?\nChoices:\nA:'
text head:
 Q: What do people aim to do at work?
Choices:
A:

IDX: 2 example_id: b8c0a4703079cf661d7261a60a1bcbff
text len: 82
text repr head: 'Q: Where would you find magazines along side many other printed works?\nChoices:\nA:'
text head:
 Q: Where would you find magazines along side many other printed works?
Choices:
A:

IDX: 3 example_id: e68fb2448fd74e402aae9982aa76e527
text len: 57
text repr head: 'Q: Where are  you likely to find a hamburger?\nChoices:\nA:'
text head:
 Q: Where are  you likely to find a ha

In [19]:
# NOTE(review): this cell repeats the text preview of the cell directly above
# and prints the same output; consider removing one of the two.
pd.set_option("display.max_colwidth", 400)
for i in range(min(4, len(df))):
    txt = df.loc[i, "text"]
    head_250, head_400 = txt[:250], txt[:400]
    print("\nIDX:", i, "example_id:", df.loc[i, "example_id"])
    print("text len:", len(txt))
    print("text repr head:", repr(head_250))
    print("text head:\n", head_400)



IDX: 0 example_id: 1afa02df02c908a558b4036e80242fac
text len: 123
text repr head: 'Q: A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?\nChoices:\nA:'
text head:
 Q: A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?
Choices:
A:

IDX: 1 example_id: a7ab086045575bb497933726e4e6ad28
text len: 48
text repr head: 'Q: What do people aim to do at work?\nChoices:\nA:'
text head:
 Q: What do people aim to do at work?
Choices:
A:

IDX: 2 example_id: b8c0a4703079cf661d7261a60a1bcbff
text len: 82
text repr head: 'Q: Where would you find magazines along side many other printed works?\nChoices:\nA:'
text head:
 Q: Where would you find magazines along side many other printed works?
Choices:
A:

IDX: 3 example_id: e68fb2448fd74e402aae9982aa76e527
text len: 57
text repr head: 'Q: Where are  you likely to find a hamburger?\nChoices:\nA:'
text head:
 Q: Where are  you likely to find a ha

In [20]:
# Inspect tokens, ids and attention mask of the first (at most) four examples.
for i in range(min(4, len(df))):
    toks = df.loc[i, "tokens"]
    ids = df.loc[i, "input_ids"]
    am = df.loc[i, "attention_mask"]
    print("\nIDX:", i)
    print("types:", type(toks), type(ids), type(am))
    print("len tokens:", len(toks), "len ids:", len(ids), "len mask:", len(am))
    k = int(np.sum(am))
    print("mask sum:", k)
    print("first 40 tokens:", toks[:40])
    print("first 40 ids:", ids[:40])
    # Select the active positions from the mask itself instead of slicing [k-40:k]
    # from the start of the row: these rows are left-padded (the leading tokens are
    # all padding and the active span ends at the last position), so a slice based
    # on the mask sum returned padding rather than the active tokens.
    active = np.flatnonzero(np.asarray(am) == 1)
    tail = active[-40:]  # last (up to) 40 active positions; empty if none are active
    print("last 40 tokens (active):", np.asarray(toks)[tail])
    print("last 40 ids (active):", np.asarray(ids)[tail])


IDX: 0
types: <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
len tokens: 128 len ids: 128 len mask: 128
mask sum: 31
first 40 tokens: ['<|endoftext|>' '<|endoftext|>' '<|endoftext|>' '<|endoftext|>'
 '<|endoftext|>' '<|endoftext|>' '<|endoftext|>' '<|endoftext|>'
 '<|endoftext|>' '<|endoftext|>' '<|endoftext|>' '<|endoftext|>'
 '<|endoftext|>' '<|endoftext|>' '<|endoftext|>' '<|endoftext|>'
 '<|endoftext|>' '<|endoftext|>' '<|endoftext|>' '<|endoftext|>'
 '<|endoftext|>' '<|endoftext|>' '<|endoftext|>' '<|endoftext|>'
 '<|endoftext|>' '<|endoftext|>' '<|endoftext|>' '<|endoftext|>'
 '<|endoftext|>' '<|endoftext|>' '<|endoftext|>' '<|endoftext|>'
 '<|endoftext|>' '<|endoftext|>' '<|endoftext|>' '<|endoftext|>'
 '<|endoftext|>' '<|endoftext|>' '<|endoftext|>' '<|endoftext|>']
first 40 ids: [50256 50256 50256 50256 50256 50256 50256 50256 50256 50256 50256 50256
 50256 50256 50256 50256 50256 50256 50256 50256 50256 50256 50256 50256
 50256 50256 50256 50256 5025

In [21]:
# Collect every *.zarr entry of the run directory, ordered by path.
zarr_paths = list(run_dir.glob("*.zarr"))
zarr_paths.sort()
print("zarr stores:", [store.name for store in zarr_paths])

zarr stores: ['dec_hidden.zarr', 'dec_res_embed.zarr', 'dec_res_post_attn.zarr', 'dec_res_post_mlp.zarr', 'dec_res_pre_attn.zarr', 'dec_self_attn.zarr', 'dec_self_qkv.zarr']


In [22]:
def describe_store(p: Path):
    """Print shape, chunking and dtype of the array(s) in a Zarr store.

    The store is opened read-only. If the opened object exposes ``array_keys``
    every array listed by it is described on its own line; otherwise the opened
    object itself is described as a single array.

    Args:
        p: Path of the Zarr store on disk.
    """
    store = zarr.open(str(p), mode="r")

    # No array_keys(): the opened object is described directly as one array.
    if not hasattr(store, "array_keys"):
        print("\n", p.name, "array shape", store.shape, "chunks", store.chunks, "dtype", store.dtype)
        return

    names = list(store.array_keys())
    print("\n", p.name, "keys:", names)
    for name in names:
        arr = store[name]
        print(" ", name, "shape", arr.shape, "chunks", arr.chunks, "dtype", arr.dtype)

# Print the layout of every Zarr store collected in zarr_paths.
for p in zarr_paths:
    describe_store(p)


 dec_hidden.zarr keys: ['h']
  h shape (4, 13, 128, 768) chunks (1, 1, 128, 768) dtype float16

 dec_res_embed.zarr keys: ['x']
  x shape (4, 128, 768) chunks (1, 128, 768) dtype float16

 dec_res_post_attn.zarr keys: ['x']
  x shape (4, 12, 128, 768) chunks (1, 1, 128, 768) dtype float16

 dec_res_post_mlp.zarr keys: ['x']
  x shape (4, 12, 128, 768) chunks (1, 1, 128, 768) dtype float16

 dec_res_pre_attn.zarr keys: ['x']
  x shape (4, 12, 128, 768) chunks (1, 1, 128, 768) dtype float16

 dec_self_attn.zarr keys: ['attn']
  attn shape (4, 12, 12, 128, 128) chunks (1, 1, 1, 128, 128) dtype float16

 dec_self_qkv.zarr keys: ['k', 'q', 'v']
  k shape (4, 12, 12, 128, 64) chunks (1, 1, 1, 128, 64) dtype float16
  q shape (4, 12, 12, 128, 64) chunks (1, 1, 1, 128, 64) dtype float16
  v shape (4, 12, 12, 128, 64) chunks (1, 1, 1, 128, 64) dtype float16


In [23]:
def zarr_slice(p: Path, key: str, slc):
    """Load one slice of a Zarr array and print its shape, dtype and min/mean/max.

    Args:
        p: Path of the Zarr store; it is opened read-only.
        key: Name of the array inside the store.
        slc: Index expression applied to that array.
    """
    values = np.asarray(zarr.open(str(p), mode="r")[key][slc])
    print(p.name, key, "slice", slc, "->", values.shape, values.dtype)
    # Statistics are converted to Python floats before printing.
    lo, mean, hi = (float(stat(values)) for stat in (np.min, np.mean, np.max))
    print("min/mean/max:", lo, mean, hi)

# Spot-check a small corner of three stores, index 0 on every leading axis.
# NOTE(review): leading axes are presumably (example, layer[, head]) -- confirm
# against the code that wrote the traces.
for _store_name, _key, _slc in (
    ("dec_hidden.zarr", "h", (0, 0, slice(0, 3), slice(0, 8))),
    ("dec_res_pre_attn.zarr", "x", (0, 0, slice(0, 3), slice(0, 8))),
    ("dec_self_attn.zarr", "attn", (0, 0, 0, slice(0, 3), slice(0, 6))),
):
    zarr_slice(run_dir / _store_name, _key, _slc)

dec_hidden.zarr h slice (0, 0, slice(0, 3, None), slice(0, 8, None)) -> (3, 8) float16
min/mean/max: -0.25537109375 -0.047393798828125 0.10443115234375
dec_res_pre_attn.zarr x slice (0, 0, slice(0, 3, None), slice(0, 8, None)) -> (3, 8) float16
min/mean/max: -0.25537109375 -0.047393798828125 0.10443115234375
dec_self_attn.zarr attn slice (0, 0, 0, slice(0, 3, None), slice(0, 6, None)) -> (3, 6) float16
min/mean/max: 0.0 0.010009765625 0.03125


In [24]:
# Choose the example explicitly instead of relying on `i` left over from an
# earlier loop cell, so the cell also works on a fresh kernel / Run All.
# This is the value that `for i in range(min(4, len(df)))` leaves behind.
i = min(4, len(df)) - 1
mask = df.loc[i, "attention_mask"]
ids = df.loc[i, "input_ids"]
toks = df.loc[i, "tokens"]

# Positions whose attention mask is 1.
idx = np.where(mask == 1)[0]
if idx.size:
    print("active span:", idx[0], "→", idx[-1])
    print("tokens:", toks[idx[0]:idx[-1]+1])
else:
    # A row without any active position has no span to report.
    print("active span: none")


active span: 108 → 127
tokens: ['Q' ':' 'ĠWhere' 'Ġare' 'Ġ' 'Ġyou' 'Ġlikely' 'Ġto' 'Ġfind' 'Ġa' 'Ġhamb'
 'urger' '?' 'Ċ' 'Cho' 'ices' ':' 'Ċ' 'A' ':']


---------------------------------

Tokenizer examination of trace (prompt token) lengths on the raw dataset

----------------------

In [25]:
from datasets import load_dataset
from transformers import AutoTokenizer
import numpy as np

# Tokenizer used for all length measurements in the cells below.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Validation split of the dataset, loaded by name through `datasets`.
ds = load_dataset("commonsense_qa", split="validation")

# Option labels looked for when counting visible answer options.
LABELS = ["A","B","C","D","E"]

def build_prompt(ex):
    """Render one example as a multiple-choice prompt string.

    Args:
        ex: Mapping with a "question" entry and a "choices" entry holding
            parallel "label" and "text" sequences.

    Returns:
        The text "Q: <question>", a "Choices:" line, one "<label>: <text>" line
        per option, and a final "Answer:" line with nothing after the colon.
    """
    question = ex["question"]
    option_lines = [
        f"{label}: {text}"
        for label, text in zip(ex["choices"]["label"], ex["choices"]["text"])
    ]
    options_block = "\n".join(option_lines)
    return f"Q: {question}\nChoices:\n{options_block}\nAnswer:"

# Token counts per example: full prompt, question + "Choices:" header, and
# option lines + "Answer:".
# NOTE(review): lens_prompt receives the same values as lens and stays a plain list.
lens, lens_q, lens_choices, lens_prompt = [], [], [], []
bad = 0  # number of prompts shorter than 10 tokens

for ex in ds:
    prompt = build_prompt(ex)
    header_text = "Q: " + ex["question"] + "\nChoices:\n"
    options_text = "\n".join(
        f"{label}: {text}"
        for label, text in zip(ex["choices"]["label"], ex["choices"]["text"])
    ) + "\nAnswer:"

    # Each piece is tokenized on its own, without special tokens.
    ids_prompt = tokenizer(prompt, add_special_tokens=False)["input_ids"]
    ids_q = tokenizer(header_text, add_special_tokens=False)["input_ids"]
    ids_choices = tokenizer(options_text, add_special_tokens=False)["input_ids"]

    Lp, Lq, Lc = len(ids_prompt), len(ids_q), len(ids_choices)
    lens.append(Lp)
    lens_q.append(Lq)
    lens_choices.append(Lc)
    lens_prompt.append(Lp)

    bad += int(Lp < 10)

# Arrays allow the vectorised percentile / threshold computations that follow.
lens, lens_q, lens_choices = (np.array(v) for v in (lens, lens_q, lens_choices))

def pct(arr):
    """Return selected percentiles of ``arr`` as integers.

    Args:
        arr: Array-like of numbers accepted by ``np.percentile``.

    Returns:
        Dict mapping each of 50, 75, 90, 95, 97, 99 and 100 to the matching
        percentile of ``arr``, converted with ``int()`` (fraction dropped).
    """
    levels = [50, 75, 90, 95, 97, 99, 100]
    result = {}
    for level in levels:
        result[level] = int(np.percentile(arr, level))
    return result

# Summary of the measured lengths.
n_total = len(lens)
print("N:", len(ds), "bad prompts:", bad)
print("prompt token lengths percentiles:", pct(lens))
print("question+header lengths percentiles:", pct(lens_q))
print("choices+Answer lengths percentiles:", pct(lens_choices))

# How many prompts are longer than each candidate sequence length.
too_128, too_256, too_384 = (int((lens > limit).sum()) for limit in (128, 256, 384))
print("exceed 128:", too_128, f"({too_128/n_total*100:.1f}%)")
print("exceed 256:", too_256, f"({too_256/n_total*100:.1f}%)")
print("exceed 384:", too_384, f"({too_384/n_total*100:.1f}%)")


  from .autonotebook import tqdm as notebook_tqdm


N: 1221 bad prompts: 0
prompt token lengths percentiles: {50: 46, 75: 51, 90: 57, 95: 60, 97: 63, 99: 69, 100: 97}
question+header lengths percentiles: {50: 21, 75: 25, 90: 30, 95: 34, 97: 37, 99: 43, 100: 72}
choices+Answer lengths percentiles: {50: 25, 75: 27, 90: 29, 95: 30, 97: 30, 99: 32, 100: 34}
exceed 128: 0 (0.0%)
exceed 256: 0 (0.0%)
exceed 384: 0 (0.0%)


In [26]:
# Example indices ordered from longest to shortest prompt.
idx_sorted = np.argsort(-lens)
# Slicing (instead of range(5)) keeps the cell working when fewer than five
# examples exist; .tolist() yields plain Python ints for dataset indexing.
for k, i in enumerate(idx_sorted[:5].tolist()):
    ex = ds[i]
    prompt = build_prompt(ex)
    print("\n--- TOP", k, "idx", i, "prompt_len", int(lens[i]), "id", ex["id"], "---")
    print(prompt[:600])



--- TOP 0 idx 979 prompt_len 97 id 5efadabaf61b5174916e3ab659bcd283 ---
Q: Danny found that the carpet did not ,match the drapes, which was disappointing, because this place was expensive.  But it was the only place in town that wasn't booked solid for the week and he needed it while he was in town, so he couldn't complain.   Where might this place be?
Choices:
A: brothel
B: restaurant
C: building
D: bowling alley
E: at hotel
Answer:

--- TOP 1 idx 628 prompt_len 82 id 41bab71fea3fa04e5a4e10a2f86996df ---
Q: The architect thought that a mezzanine would look good, but the planning committee rejected it.  They told the architect that they felt it was a potential hazard given the ages of the people who would be using it.  What might they be designing?
Choices:
A: actors
B: theater
C: concert hall
D: floors
E: school
Answer:

--- TOP 2 idx 316 prompt_len 81 id d867f76d000bdb59b9b4cb982bd7f0a0 ---
Q: Joe was thrown from his boat into the water.  The water was cold because it was the middle

In [27]:
def count_options_in_prefix(prompt, max_len):
    """Count the answer-option labels still visible after truncating a prompt.

    The prompt is tokenized with the notebook-level ``tokenizer`` (no special
    tokens), cut to its first ``max_len`` tokens and decoded back to text.
    A label from ``LABELS`` counts as visible only when it starts a line, i.e.
    the decoded text contains a newline immediately followed by ``<label>:``.
    The previous check also accepted a bare ``<label>:`` anywhere in the text,
    which made the line-start test redundant and let a matching substring in
    the middle of a line count as a visible option.

    Args:
        prompt: Prompt text to truncate.
        max_len: Maximum number of tokens kept.

    Returns:
        Tuple ``(count, text)``: the number of visible labels and the decoded
        truncated text.
    """
    ids = tokenizer(prompt, add_special_tokens=False)["input_ids"][:max_len]
    txt = tokenizer.decode(ids)
    visible = sum(f"\n{label}:" in txt for label in LABELS)
    return visible, txt

# For each candidate sequence length, tally how many examples keep 0..5 options visible.
for max_len in [128, 256]:
    counts = np.array(
        [count_options_in_prefix(build_prompt(ex), max_len)[0] for ex in ds]
    )
    histogram = {v: int((counts == v).sum()) for v in range(0, 6)}
    print("\nmax_len", max_len, "options visible counts:", histogram)



max_len 128 options visible counts: {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 1221}

max_len 256 options visible counts: {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 1221}


In [28]:
# Decode the active (mask == 1) span of the first stored example.
i = 0
mask = df.loc[i, "attention_mask"]
ids = df.loc[i, "input_ids"]
idx = np.where(mask == 1)[0]
first, last = int(idx[0]), int(idx[-1])
# Everything from the first to the last active position, inclusive.
active_ids = ids[first:last + 1].tolist()
print("active span:", first, "->", last, "len:", len(active_ids))
print(tokenizer.decode(active_ids))


active span: 97 -> 127 len: 31
Q: A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?
Choices:
A:


In [29]:
# Show type, length and content of the csqa_choices entry of every row.
for i in range(len(df)):
    v = df.loc[i, "csqa_choices"]
    size = len(v) if hasattr(v, "__len__") else None  # None when the value has no length
    print(i, type(v), "len:", size, "value:", v)


0 <class 'numpy.ndarray'> len: 0 value: []
1 <class 'numpy.ndarray'> len: 0 value: []
2 <class 'numpy.ndarray'> len: 0 value: []
3 <class 'numpy.ndarray'> len: 0 value: []
