In [5]:
import torch
from transformers import BertModel, BertTokenizer
import json
import os

# 1. Load the pretrained BERT checkpoint together with its matching tokenizer.
model_name = "bert-base-uncased"
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# 2. Module-level store filled in by the forward hooks:
#    maps a layer label to the captured activation (as nested Python lists).
activations = dict()

# 3. Factory for forward hooks that capture the activation at one token position
def hook_fn(layer_name, target_token_idx, store=None):
    """Build a forward hook that records one token position of a module's output.

    Args:
        layer_name: Key under which the captured activation is stored.
        target_token_idx: Index along dimension 1 of the module output to keep.
        store: Optional dict that receives the result. When ``None`` (the
            default) the module-level ``activations`` dict is used, looked up
            at the time the hook fires.

    Returns:
        A callable ``hook(module, input, output)`` suitable for
        ``register_forward_hook``. It returns ``None``, so the module output
        is left unmodified.
    """
    def hook(module, input, output):
        # Slice out the single token position *before* detaching and moving
        # to CPU, so only (batch, hidden) values are copied instead of the
        # full (batch, seq_len, hidden) tensor.
        target_activation = output[:, target_token_idx, :].detach().cpu()
        destination = activations if store is None else store
        # Nested Python lists keep the result JSON-serialisable.
        destination[layer_name] = target_activation.tolist()
    return hook

# 4. Bookkeeping for the registered hook handles, and the example input
hooks = []
input_text = "Hello, how are you?"

# 5. Tokenize the input text and choose the token position to capture
inputs = tokenizer(input_text, return_tensors="pt")
# NOTE(review): `tokens` is kept only for inspection. It comes from
# tokenize(), which presumably omits special tokens such as [CLS], so its
# indices may not line up with positions in `inputs` -- confirm before using
# it to pick target_token_idx.
tokens = tokenizer.tokenize(input_text)

# Capture the first position ([CLS] is normally the first token)
target_token_idx = 0

# 6./7. Register a hook on every layer's intermediate.dense (the FFN part) and
#       run the forward pass. The try/finally guarantees the hooks are removed
#       even if the forward pass raises, so re-running the cell never leaves
#       stale or duplicated hooks attached to the model.
try:
    for i, layer in enumerate(model.encoder.layer):
        hook = layer.intermediate.dense.register_forward_hook(hook_fn(f"layer_{i}_ffn", target_token_idx))
        hooks.append(hook)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
finally:
    # 9. Remove the hooks
    for hook in hooks:
        hook.remove()

# 8. Persist the captured activations to disk as JSON
save_dir = "./activations"
os.makedirs(save_dir, exist_ok=True)

# Destination file
activation_file_path = os.path.join(save_dir, "activations.json")

# Explicit encoding keeps the file identical across platforms
with open(activation_file_path, 'w', encoding="utf-8") as json_file:
    json.dump(activations, json_file)

print(f"Activations saved to {activation_file_path}")


Activations saved to ./activations/activations.json


In [None]:
from datasets import load_dataset

# Fetch the Caselaw Access Project corpus from the Hugging Face Hub.
# Heavy cell: the recorded run generated a train split of ~2.57M examples
# and took over a minute and a half.
ds = load_dataset(path="free-law/Caselaw_Access_Project")

  from .autonotebook import tqdm as notebook_tqdm
Downloading data: 100%|██████████| 58/58 [00:00<00:00, 85.60files/s]
Generating train split: 2566829 examples [01:39, 16833.30 examples/s]