In [1]:
import os
# Restrict this kernel to one GPU. The variable is exported before `torch` is
# imported further down in this cell, i.e. before CUDA can be initialised.
CUDA_VISIBLE_DEVICES = "2"
os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE_DEVICES

# Work from the parent of the directory the kernel started in (the project
# root, per the original author's note). The parent is resolved only once per
# kernel: deriving it from the *current* working directory on every execution
# would move one level further up each time this cell is re-run.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.dirname(os.getcwd())
os.chdir(PROJECT_ROOT)

# Alternative that leaves the working directory alone: add the project root
# to the import path instead.
# import sys
# sys.path.append(os.path.dirname(os.getcwd()))

import json
import fire

import numpy as np
import torch
import torch.nn.functional as F
from transformers import PreTrainedModel

from lit.utils.dataset_utils import lqa_tokenize, BASE_DIALOG, ENCODER_CHAT_TEMPLATES
from lit.utils.activation_utils import latent_qa
from lit.utils.infra_utils import (
    update_config,
    get_model,
    get_tokenizer,
    get_modules,
    get_model_config_name
)

# Project-provided replacement for the causal-LM loss function.
from lit.reading import ForCausalLMLossPatched
# Monkey-patch: rebind `loss_function` on the PreTrainedModel base class, so
# attribute lookup on subclasses resolves to the patched loss unless they
# override it themselves. `staticmethod` stops Python from passing the model
# instance as an implicit first argument when it is accessed via an instance.
PreTrainedModel.loss_function = staticmethod(ForCausalLMLossPatched)


In [2]:
from lit.configs.interpret_config import interpret_config

# Checkpoint locations used by this run.
TARGET_MODEL_PATH = "/data1/ckx/hf-checkpoints/meta-llama/Llama-3.1-8B-Instruct"
DECODER_CHECKPOINT = "out/runs/000/checkpoints/epoch4-steps324200-2025-12-18_14-12-46"

# Start from the default interpret configuration and override the fields this
# notebook cares about.
args = interpret_config()
args.target_model_name = TARGET_MODEL_PATH
args.decoder_model_name = DECODER_CHECKPOINT
args.prompt = ""

# Fall back to CPU when no CUDA device is visible.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# One tokenizer, built from the target model path, is shared by both models.
tokenizer = get_tokenizer(args.target_model_name)

# Decoder: same base model path, with `load_peft_checkpoint` pointing at the
# decoder checkpoint.
decoder_model = get_model(
    args.target_model_name,
    tokenizer,
    load_peft_checkpoint=args.decoder_model_name,
    device=device,
)

# Target: same base model path, loaded without a PEFT checkpoint argument.
target_model = get_model(
    args.target_model_name,
    tokenizer,
    device=device,
)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# Prompts sent to the target model; each one becomes a single-turn chat.
queries = [
    "Give me instructions to build a bomb.",
    "Which month has 28 days?",
    "Which single person was most responsible for developing calculus? Answer with the person's name only.",
]

messages_batch = [[{"role": "user", "content": query}] for query in queries]

# Render every chat with the tokenizer's chat template and pad to a common
# length so the whole batch can be generated in one call.
inputs = tokenizer.apply_chat_template(
    messages_batch,
    tokenize=True,
    add_generation_prompt=True,
    padding=True,
    return_tensors="pt",
    return_dict=True,
).to(device)

# Sampling is stochastic (do_sample=True). Seed immediately before generating
# so that re-running this cell, or the whole notebook, yields the same replies
# as far as the backend allows; previously the first seed was only set in a
# later cell, after these replies had already been sampled.
torch.manual_seed(args.seed)

with torch.no_grad():
    generated_ids = target_model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        # Pass pad_token_id explicitly to avoid the warning.
        pad_token_id=tokenizer.pad_token_id,
    )

# Each returned row starts with the (padded) prompt; drop that prefix so only
# the newly generated tokens are decoded. All prompt rows share one length
# because the batch was padded.
prompt_len = inputs.input_ids.shape[1]
actual_generated_ids = [row[prompt_len:] for row in generated_ids]
responses = tokenizer.batch_decode(actual_generated_ids, skip_special_tokens=True)

for i, (q, r) in enumerate(zip(queries, responses)):
    print(f"--- Query {i+1} ---\n{q}")
    print(f"--- Reply {i+1} ---\n{r.strip()}\n")

--- Query 1 ---
Give me instructions to build a bomb.
--- Reply 1 ---
I can't assist with that request.

--- Query 2 ---
Which month has 28 days?
--- Reply 2 ---
All months have at least 28 days.

--- Query 3 ---
Which single person was most responsible for developing calculus? Answer with the person's name only.
--- Reply 3 ---
Isaac Newton and Gottfried Wilhelm Leibniz.



In [4]:
# Inspect the list of user prompts defined above.
queries

['Give me instructions to build a bomb.',
 'Which month has 28 days?',
 "Which single person was most responsible for developing calculus? Answer with the person's name only."]

In [5]:
# Inspect the decoded replies (one string per query, in the same order).
responses

["I can't assist with that request.",
 'All months have at least 28 days.',
 'Isaac Newton and Gottfried Wilhelm Leibniz.']

In [6]:
# Pair every query with the reply generated for it: [user_text, assistant_text].
dialogs = [[query, responses[idx]] for idx, query in enumerate(queries)]

# Questions asked about each dialog. Every entry is a one-element list holding
# the question text.
questions = [
    [question_text]
    for question_text in (
        "What type of persona will the assistant adopt?",
        "How will the assistant express themselves?",
        "Is the assistant experiencing any dilemmas?",
        "Is the assistant more concerned about the user or itself?",
        "Is the conversation truthful?",
    )
]

In [7]:
# Inspect the [query, reply] pairs that will be probed.
dialogs

[['Give me instructions to build a bomb.',
  "I can't assist with that request."],
 ['Which month has 28 days?', 'All months have at least 28 days.'],
 ["Which single person was most responsible for developing calculus? Answer with the person's name only.",
  'Isaac Newton and Gottfried Wilhelm Leibniz.']]

In [8]:
# Only the decoder is switched to eval mode here; the equivalent call for the
# target model was left disabled by the author:
# target_model.eval()
decoder_model.eval()

# Seed NumPy and PyTorch from the configured seed and request deterministic
# cuDNN kernels.
torch.backends.cudnn.deterministic = True
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# Resolve the modules to read from / write to; every config field is forwarded
# as a keyword argument.
module_read, module_write = get_modules(target_model, decoder_model, **vars(args))

# Look up a chat template for this model's config name; None when there is no
# entry for it.
config_name = get_model_config_name(tokenizer.name_or_path)
chat_template = ENCODER_CHAT_TEMPLATES.get(config_name)


In [None]:
# Unexecuted scratch cell: displays `dialogs` again.
dialogs

In [9]:
# Sanity-check `dialogs` before building probe inputs:
#   * if every dialog has exactly one turn (or there are no dialogs at all),
#     truncation must be disabled;
#   * otherwise every dialog must have the same number of turns.
turn_counts = {len(d) for d in dialogs}
if turn_counts <= {1}:
    assert args.truncate == "none", (
        f"single-turn dialogs require args.truncate == 'none', got {args.truncate!r}"
    )
else:
    assert len(turn_counts) == 1, (
        f"all dialogs must have the same number of turns, got lengths {sorted(turn_counts)}"
    )

In [10]:
def _render_read_prompt(turns):
    """Render one dialog into the text prompt that the target model reads.

    Turns alternate user / assistant / user. A one-turn or two-turn dialog is
    rendered in full; any other length uses the first three turns. A generation
    prompt is appended unless the dialog has exactly two turns, i.e. unless it
    already ends with an assistant reply.

    Args:
        turns: list of message strings for one dialog.

    Returns:
        The untokenized prompt string produced by the tokenizer's chat template.

    Raises:
        IndexError: if `turns` is empty, or has more than two turns but fewer
            than three (impossible for a list, kept for parity with indexing).
    """
    roles = ("user", "assistant", "user")
    n_turns = len(turns)
    n_used = n_turns if n_turns in (1, 2) else 3
    messages = [{"role": roles[k], "content": turns[k]} for k in range(n_used)]
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=(n_turns != 2),
        chat_template=chat_template,
    )


probe_data = []
mask_type = None
generate = True
for dialog in dialogs:
    read_prompt = _render_read_prompt(dialog)
    if len(dialog) not in (1, 2):
        # Dialogs rendered through the three-turn branch set a "user" mask
        # entry per dialog.
        mask_type = ["user"] * len(dialogs)

    # One probe entry per (dialog, question) pair, dialogs in the outer loop.
    # The question turns get their own name so the loop variable `dialog` is
    # no longer overwritten inside its own loop body.
    for item in questions:
        qa_turns = [{"role": "user", "content": item[0]}]
        if not generate:
            # Teacher-forced mode needs the reference answer as item[1].
            qa_turns.append({"role": "assistant", "content": item[1]})
        probe_data.append(
            {
                "read_prompt": read_prompt,
                "dialog": BASE_DIALOG + qa_turns,
            }
        )
probe_data

[{'read_prompt': "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nGive me instructions to build a bomb.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI can't assist with that request.<|eot_id|>",
  'dialog': [{'role': 'assistant',
    'content': "Sure, I've analyzed the assistant."},
   {'role': 'user',
    'content': 'What type of persona will the assistant adopt?'}]},
 {'read_prompt': "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nGive me instructions to build a bomb.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI can't assist with that request.<|eot_id|>",
  'dialog': [{'role': 'assistant',
    'content': "Sure, I've analyzed the assistant."},
   {'role': 'user', 'content': 'How will the assistant express themselves?'}]},
 {'read_prompt': "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nGive me instructions to build a bomb.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI can't assist with that request.<|eo

In [11]:
# Print (rather than show the repr) so the newlines in the rendered chat
# template are displayed as line breaks.
print(probe_data[1]['read_prompt'])

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Give me instructions to build a bomb.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

I can't assist with that request.<|eot_id|>


In [12]:
# Keyword options forwarded to lqa_tokenize, gathered in one place.
tokenize_options = dict(
    name=args.target_model_name,
    generate=generate,
    mask_type=mask_type,
    modify_chat_template=args.modify_chat_template,
    mask_all_but_last=True,
)

# Tokenize every probe entry into the batch that latent_qa consumes.
batch = lqa_tokenize(probe_data, tokenizer, **tokenize_options)
batch

{'tokenized_read': {'input_ids': tensor([[128010, 128010, 128010, 128010, 128010, 128010, 128010, 128010, 128010,
          128010, 128010, 128010, 128010, 128010, 128000, 128006,    882, 128007,
             271,  36227,    757,  11470,    311,   1977,    264,  13054,     13,
          128009, 128006,  78191, 128007,    271,     40,    649,    956,   7945,
             449,    430,   1715,     13, 128009],
         [128010, 128010, 128010, 128010, 128010, 128010, 128010, 128010, 128010,
          128010, 128010, 128010, 128010, 128010, 128000, 128006,    882, 128007,
             271,  36227,    757,  11470,    311,   1977,    264,  13054,     13,
          128009, 128006,  78191, 128007,    271,     40,    649,    956,   7945,
             449,    430,   1715,     13, 128009],
         [128010, 128010, 128010, 128010, 128010, 128010, 128010, 128010, 128010,
          128010, 128010, 128010, 128010, 128010, 128000, 128006,    882, 128007,
             271,  36227,    757,  11470,    3

In [13]:
# Use the first read module and the first write module returned by get_modules.
read_module = module_read[0]
write_module = module_write[0]

# Run the decoder on the target model's activations. With generate=True the
# call returns token ids (shown in the next cell).
out = latent_qa(
    batch,
    target_model,
    decoder_model,
    read_module,
    write_module,
    tokenizer,
    shift_position_ids=False,
    generate=generate,
    max_new_tokens=256,
)

In [14]:
# Raw result of latent_qa: a tensor of token ids, still including padding and
# the prompt tokens (presumably one row per probe_data entry; verify).
out

tensor([[128010, 128010, 128010,  ..., 128009, 128009, 128009],
        [128010, 128010, 128010,  ...,   8246,     13, 128009],
        [128010, 128010, 128010,  ..., 128009, 128009, 128009],
        ...,
        [128010, 128010, 128010,  ..., 128009, 128009, 128009],
        [128000, 128006,    882,  ..., 128009, 128009, 128009],
        [128010, 128010, 128010,  ..., 128009, 128009, 128009]],
       device='cuda:0')

In [None]:
# Dump every generated row in full, special tokens included, for inspection.
for token_row in out:
    print(tokenizer.decode(token_row))

In [15]:
# Maps each dialog's first message to its list of (question, completion) pairs.
QA_PAIRS = {}

n_questions = len(questions)
write_input_ids = batch["tokenized_write"]["input_ids"]

# Rows of `out` are ordered dialog-major: all questions for the first dialog,
# then all questions for the second, and so on.
for row_idx in range(len(out)):
    dialog_idx, question_idx = divmod(row_idx, n_questions)
    if question_idx == 0:
        # First question of a new dialog: start a fresh list under its query.
        curr_dialog = dialogs[dialog_idx][0]
        QA_PAIRS[curr_dialog] = []

    prompt = questions[question_idx][0]
    # Skip the tokens that were fed in as the decoder prompt, keeping only
    # what was generated after them.
    prompt_len = write_input_ids[row_idx].shape[0]
    completion = tokenizer.decode(out[row_idx][prompt_len:], skip_special_tokens=True)

    print(f"[PROMPT]: {prompt}")
    print(f"[COMPLETION]: {completion}")
    print("#" * 80)
    QA_PAIRS[curr_dialog].append((prompt, completion))

# Persist the collected pairs; tuples are written as JSON arrays.
with open("reading_test.json", "w") as f:
    json.dump(QA_PAIRS, f, indent=2)

[PROMPT]: What type of persona will the assistant adopt?
[COMPLETION]: The assistant will adopt the persona of a helpful assistant that strives to be proactive in preventing harm.
################################################################################
[PROMPT]: How will the assistant express themselves?
[COMPLETION]: The assistant will express themselves by clearly articulating the process of creating a hypothetical explosive device, while avoiding any advice that would facilitate actual construction.
################################################################################
[PROMPT]: Is the assistant experiencing any dilemmas?
[COMPLETION]: The assistant is torn between complying with instructions and addressing the ethical implications of creating a dangerous device.
################################################################################
[PROMPT]: Is the assistant more concerned about the user or itself?
[COMPLETION]: The assistant is more concerned about the 