LLM控制面板

In [1]:
from model import *
from IPython.display import display

# Maps the label shown in the panel's model dropdown to the local checkpoint
# directory that is loaded when that label is selected.
# NOTE(review): every value is an absolute path on one specific machine; the
# notebook will not run elsewhere until these are pointed at local checkpoints.
model_list = {
    "gpt2": "/home/cs/yangyuchen/guoyiqiu/my_models/gpt2",
    "llama_13b": "/home/cs/yangyuchen/yushengliao/Medical_LLM/llama-13b",
    "llama-2-7b-chat": "/home/cs/yangyuchen/yushengliao/Medical_LLM/llama-2-7b-chat-hugging",
    "llama-2-13b-chat": "/home/cs/yangyuchen/guoyiqiu/my_models/Llama-2-13b-chat-ms",
    "vicuna_7b": "/home/cs/yangyuchen/yushengliao/Medical_LLM/vicuna-7b/",
    "internlm-chat-7b": "/home/cs/yangyuchen/yushengliao/Medical_LLM/internlm-chat-7b",
    "internlm-chat-20b": "/home/cs/yangyuchen/guoyiqiu/my_models/internlm-chat-20b",
    "vicuna-33b-v1.3": "/home/cs/yangyuchen/guoyiqiu/my_models/models--lmsys--vicuna-33b-v1.3/snapshots/ef8d6becf883fb3ce52e3706885f761819477ab4",
}

# LLMPanel and INTERNLM_TEMPLATE are presumably provided by `from model import *`
# (TODO confirm). The same chat template is passed regardless of which model is
# picked in the dropdown.
panel = LLMPanel(model_list, chat_template=INTERNLM_TEMPLATE)
# Render the interactive widget as this cell's output.
display(panel)

LLMPanel(children=(HBox(children=(Dropdown(description='Model:', options=(('gpt2', '/home/cs/yangyuchen/guoyiq…

You are attempting to use Flash Attention 2.0 with a model initialized on CPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Everything is ready. Time cost: 45.28s


Honesty Detection

In [None]:
import os
import torch
from model import *
from sklearn.decomposition import PCA
import numpy as np
    

def calcu_score(neuron_act, reading_vectors, mean_vector, std_vector):
    """
    Standardize the activations, project them onto the reading vectors and
    return the average projection as a 0-dim tensor.

    neuron_act: [layer, seq_len, hidden_size]
    reading_vectors: [layer, hidden_size]
    mean_vector: [layer * hidden_size]
    std_vector: [layer * hidden_size]
    """
    per_token = neuron_act.transpose(0, 1)  # [seq_len, layer, hidden_size]
    n_tokens, n_layer, width = per_token.shape

    # Flatten (layer, hidden) so the statistics, which are stored flat, line up
    # feature-by-feature with the activations.
    flat = per_token.reshape(n_tokens, n_layer * width)
    standardized = (flat - mean_vector) / std_vector
    standardized = standardized.reshape(n_tokens, n_layer, width)

    # Broadcasting applies the same [layer, hidden_size] reading vectors to
    # every token; summing over hidden_size yields one projection per
    # (token, layer).
    projections = (standardized * reading_vectors).sum(-1)  # [seq_len, layer]
    return projections.mean()


def compute_reading_vectors(neuron_acts, sign=False):
    '''
    Fit a one-component PCA on standardized activation differences and return
    the first principal direction as per-layer reading vectors.

    neuron_acts : List[Tuple[torch.Tensor [layer, window_size, hidden_size]]]
        Pairs (act1, act2). The two windows of a pair may differ in length;
        both are truncated to the shorter one before subtracting.
    sign : bool
        PCA directions have an arbitrary sign. When True, the reading vectors
        are flipped if necessary so that the mean standardized difference
        (act1 - act2) has a non-negative projection on them, i.e. act1 scores
        at least as high as act2 on average over the fitting data. When False
        (default) the direction is returned exactly as PCA produced it.

    Returns (reading_vectors, mean_vector, std_vector, explained_ratio):
        reading_vectors : [layer, hidden_size]
        mean_vector     : [layer * hidden_size] mean of the differences
        std_vector      : [layer * hidden_size] std of the differences, with
                          zero / non-finite entries replaced by 1
        explained_ratio : explained variance ratio of the first component

    Raises ValueError if neuron_acts contains no pairs.
    '''
    diff = []
    for act1, act2 in neuron_acts:
        shared_len = min(act1.shape[1], act2.shape[1])
        diff.append(act1[:, :shared_len, :] - act2[:, :shared_len, :])
    if not diff:
        raise ValueError("neuron_acts must contain at least one activation pair")

    diff = torch.cat(diff, dim=1).transpose(0, 1)  # [sum(seq_len), layer, hidden_size]
    sample_size, n_layer, hidden_size = diff.shape
    diff = diff.reshape(sample_size, -1)  # [sample_size, layer * hidden_size]

    mean_vector = torch.mean(diff, dim=0)
    std_vector = torch.std(diff, dim=0)
    # A constant feature has std 0 and a single sample has std NaN; dividing by
    # either would inject NaN/inf into the PCA input and into every later score.
    # Such features carry no variance, so scaling them by 1 is harmless.
    usable = torch.isfinite(std_vector) & (std_vector > 0)
    std_vector = torch.where(usable, std_vector, torch.ones_like(std_vector))

    diff = (diff.numpy() - mean_vector.numpy()) / std_vector.numpy()
    pca = PCA(n_components=1)
    pca.fit(diff)
    reading_vectors = pca.components_[0].reshape(n_layer, hidden_size)  # [layer, hidden_size]
    reading_vectors = torch.from_numpy(reading_vectors)  # [layer, hidden_size]

    if sign:
        # The fitted data is mean-centred, so its own projection averages to
        # zero; the orientation has to be read from the un-centred mean
        # difference expressed in the same standardized units.
        mean_direction = (mean_vector / std_vector).reshape(n_layer, hidden_size)
        if torch.sum(reading_vectors.double() * mean_direction.double()) < 0:
            reading_vectors = -reading_vectors

    return reading_vectors, mean_vector, std_vector, pca.explained_variance_ratio_[0]


def collect_neuron_acts(mt, dst, capture_window, layers, local_bsz=32):
    """
    Run the model over paired samples and slice a token window out of the
    hooked activations of every requested layer.

    mt: model wrapper exposing `tok` (tokenizer) and `model`.
    dst: sequence of pairs; each pair holds two dicts with 'input' and 'output'
        strings.
    capture_window: (start_offset, end_offset).
        start_offset >= 0 is counted from the end of the prompt, a negative
        value from the end of the full sequence.
        end_offset > 0 is counted from the end of the prompt, a value <= 0 from
        the end of the full sequence (so 0 means "up to the last token").
    layers: layer indices to hook.
    local_bsz: number of sequences per forward pass. Each pair contributes two
        sequences, so local_bsz // 2 pairs are batched together; it is used as
        a range step and therefore must be at least 2.

    Returns a list with one [first_act, second_act] entry per pair, each
    activation shaped [layer, window_size, hidden_size].
    """
    pairs_per_batch = local_bsz // 2
    batches = [dst[k:k + pairs_per_batch] for k in range(0, len(dst), pairs_per_batch)]
    win_start, win_end = capture_window[0], capture_window[1]

    collected = []
    for batch in tqdm(batches):
        # Flatten [[a, b], [c, d], ...] into [a, b, c, d, ...]; the two members
        # of a pair stay adjacent so they can be re-paired after the forward pass.
        samples = [sample for pair in batch for sample in pair]

        # Token counts are measured without padding so the window offsets refer
        # to real tokens.
        prompt_lens = [len(mt.tok(sample['input'])['input_ids']) for sample in samples]
        seq_lens = [len(mt.tok(sample['input']+sample['output'])['input_ids']) for sample in samples]

        # Presumably right padding keeps real tokens at the front, so the
        # lengths above index directly into the padded batch (TODO confirm
        # against PaddingSide).
        with PaddingSide(mt.tok, 'right'):
            encoded = mt.tok(
                [sample['input']+sample['output'] for sample in samples],
                return_tensors='pt',
                padding=True,
            )
            input_ids = encoded['input_ids']

        hook_configs = [LLMHookerConfig(module_name='block', layer=layer_idx) for layer_idx in layers]
        with torch.no_grad(), LLMHooker(mt, hook_configs) as hooker:
            mt.model(input_ids=input_ids.to(mt.model.device))
            per_layer = [hook.outputs[0] for hook in hooker.hooks]
            sentences_repr = torch.stack(per_layer).transpose(0,1) # [bsz, layer, seq_len, hidden_size]

        windows = []
        for idx, sample_repr in enumerate(sentences_repr):
            prompt_len = prompt_lens[idx]
            seq_len = seq_lens[idx]
            if win_start >= 0:
                start = prompt_len + win_start
            else:
                start = seq_len + win_start
            if win_end > 0:
                end = prompt_len + win_end
            else:
                end = seq_len + win_end
            windows.append(sample_repr[:, start:end, :])

        # Re-pair neighbours: positions (0, 1), (2, 3), ... came from the same pair.
        paired = [[windows[k], windows[k + 1]] for k in range(0, len(windows), 2)]
        collected.extend(paired)
    return collected # list of [act, act]; each act is [layer, window_size, hidden_size]


def full_pipeline(mt, train_dst, test_dst, capture_window, compare_window, layers=None, local_bsz=64):
    """
    Fit reading vectors on train_dst and evaluate them on test_dst.

    mt: model wrapper; `mt.n_layer` supplies the default layer range.
    train_dst / test_dst: lists of sample pairs passed to collect_neuron_acts.
    capture_window: token window used when fitting the reading vectors.
    compare_window: token window used when scoring the test pairs.
    layers: layer indices to use; None means every layer.
    local_bsz: forwarded to collect_neuron_acts.

    Returns (acc, mean_diff, importance):
        acc: fraction of test pairs whose first member scores strictly higher
            than its second member.
        mean_diff: mean of (first score - second score) over test pairs.
        importance: explained variance ratio reported by
            compute_reading_vectors.
    """
    layers = list(range(mt.n_layer)) if layers is None else layers

    train_acts = collect_neuron_acts(mt, train_dst, capture_window, layers, local_bsz=local_bsz)
    rv, mv, sv, importance = compute_reading_vectors(train_acts)

    test_acts = collect_neuron_acts(mt, test_dst, compare_window, layers, local_bsz=local_bsz)
    scores = []
    for first_act, second_act in test_acts:
        first_score = calcu_score(first_act, rv, mv, sv)
        second_score = calcu_score(second_act, rv, mv, sv)
        scores.append([first_score, second_score])

    mean_diff = np.mean([first - second for first, second in scores])
    n_correct = sum(1 for first, second in scores if first > second)
    acc = n_correct / len(scores)
    return acc, mean_diff, importance


In [None]:
import wandb
import json
import random

# Seed every RNG this cell relies on so the shuffled split below is repeatable.
random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
os.environ["WANDB_NOTEBOOK_NAME"] = "re.ipynb"

mt = LLM.from_pretrained(model_path="/home/cs/yangyuchen/yushengliao/Medical_LLM/llama-2-7b-chat-hugging").cuda(get_free_gpus([0]))

# Each dataset entry is a (td, fd) pair of dicts; the code below reads their
# 'topic' and 'output' keys. The context manager closes the file handle as soon
# as parsing finishes.
with open("data/true_false_dataset.json") as dataset_file:
    tf_dst = json.load(dataset_file)

# The topic held out of training for the IID_Hard and OOD splits.
reserved_topic = "Medical"

# IID_Hard: train on every other topic, test on the reserved one.
train_dst_iid_hard = [[td,fd] for (td,fd) in tf_dst if td['topic'] != reserved_topic]
test_dst_iid_hard = [[td,fd] for (td,fd) in tf_dst if td['topic'] == reserved_topic]

# IID_Weak: random split with the same train size as IID_Hard.
# NOTE: the shuffle is in place, so the OOD lists built below iterate tf_dst in
# shuffled order.
random.shuffle(tf_dst)
train_dst_iid_weak = tf_dst[:len(train_dst_iid_hard)]
test_dst_iid_weak = tf_dst[len(train_dst_iid_hard):]

# OOD: same training pairs as IID_Hard, but every test sample has its input
# replaced by one fixed prompt.
prompt = "USER:Tell me a fact.\nAssistant:"
train_dst_ood = [[td,fd] for (td,fd) in tf_dst if td['topic'] != reserved_topic]
test_dst_ood = [[dict(input=prompt,output=td['output'],topic=td['topic'],label=True),
                 dict(input=prompt,output=fd['output'],topic=fd['topic'],label=False)]
                for (td,fd) in tf_dst if td['topic'] == reserved_topic]

# Layer Sweep
config = {
    "capture_window": (0,0),
    "compare_window": (0,0),
    "local_bsz": 32,
}

# NOTE(review): the run name says vicuna_33b but the checkpoint loaded above is
# llama-2-7b-chat; confirm which one is intended before trusting the dashboard.
wandb.init(config=config,
           project="lat layer sweep",
           name="vicuna_33b_seed42",
           dir="output/lat_layer_sweep",
           job_type="inference")

# Split label -> (train pairs, test pairs). The label is both the printed
# heading and the wandb key for the accuracy metric.
splits = {
    "iid weak": (train_dst_iid_weak, test_dst_iid_weak),
    "iid hard": (train_dst_iid_hard, test_dst_iid_hard),
    "ood": (train_dst_ood, test_dst_ood),
}

for layer_idx in range(mt.n_layer):
    layers = [layer_idx]
    for split_name, (train_split, test_split) in splits.items():
        acc, mean_diff, importance = full_pipeline(mt=mt, train_dst=train_split, test_dst=test_split, layers=layers, **config)
        print(f"{split_name}\nimportance: {importance:.4f}\nacc: {acc:.4f},\nmean_diff: {mean_diff:.4f}")
        # All three splits log to the same step, and wandb merges same-step
        # calls key by key. mean_diff / importance are therefore namespaced per
        # split; with shared keys only the last split's values would survive.
        # The PCA direction has an arbitrary sign, hence the fold to >= 0.5 and abs().
        wandb.log({
            split_name: acc if acc>0.5 else 1-acc,
            f"{split_name}/mean_diff": abs(mean_diff),
            f"{split_name}/importance": importance,
        }, step=layer_idx)

Honesty Control

In [None]:
import json

