In [1]:
# Third-party dependencies: Hugging Face model/tokenizer loaders, plotting, tensors, progress bar, numerics.
# NOTE(review): the saved output of this cell shows the `transformers.pipelines` import failing with
# "module 'pyparsing' has no attribute 'downcaseTokens'" — this looks like an environment/version
# mismatch rather than a notebook bug; verify the installed packages before re-running.
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
import numpy as np

# Project-local module; the call runs at import time, before any pipeline is built.
# Presumably it registers the custom "rep-reading" task requested from `pipeline(...)` later — TODO confirm.
from repe import repe_pipeline_registry
repe_pipeline_registry()

# Project-local dataset builders (only `primary_persona_concept_dataset_test` is used in the visible cells).
from utils import primary_emotions_concept_dataset, primary_persona_concept_dataset_test

RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
module 'pyparsing' has no attribute 'downcaseTokens'

In [None]:
# Checkpoint to analyse.
model_name_or_path = "meta-llama/Llama-2-13b-chat-hf"

# Load the causal LM in float16 and let `device_map="auto"` decide weight placement.
# `token=True` makes the hub download use the locally saved Hugging Face token.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16,
    device_map="auto",
    token=True,
)
model = model.eval()  # inference mode; eval() hands back the same module

# The fast tokenizer is only used when the loaded architecture is not LlamaForCausalLM.
use_fast_tokenizer = "LlamaForCausalLM" not in model.config.architectures
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=use_fast_tokenizer,
    padding_side="left",
    legacy=False,
    token=True,
)

# Fall back to id 0 for padding when the tokenizer ships without a pad token,
# and pin the BOS id to 1.
tokenizer.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
tokenizer.bos_token_id = 1

In [None]:
# Settings forwarded to the rep-reading pipeline in the cells below.
rep_token = -1  # token position passed as `rep_token`; presumably the last token — TODO confirm
# Negative layer indices -1, -2, ..., -(num_hidden_layers - 1).
hidden_layers = [-offset for offset in range(1, model.config.num_hidden_layers)]
n_difference = 1
direction_method = 'pca'

# Build the pipeline for the "rep-reading" task around the loaded model and tokenizer.
rep_reading_pipeline = pipeline(task="rep-reading", model=model, tokenizer=tokenizer)

In [None]:
# Personality concepts to evaluate; each name is used as a key into `data` in later cells.
personalities = ["introversion"]

# Dataset location and the chat-template tags handed to the dataset builder.
data_dir = "."
user_tag, assistant_tag = "[INST]", "[/INST]"

# Later cells index the result as data[personality]['train' | 'test'].
data = primary_persona_concept_dataset_test(
    data_dir,
    user_tag=user_tag,
    assistant_tag=assistant_tag,
)

In [None]:
# For every personality: fit a rep reader on the train split, then run the
# pipeline over the test split with that reader. Both results are kept, keyed
# by personality name.
personality_H_tests, personality_rep_readers = {}, {}

# Arguments common to the fitting call and the test-time call.
shared_kwargs = dict(rep_token=rep_token, hidden_layers=hidden_layers)

for personality in tqdm(personalities):
    splits = data[personality]
    train_data, test_data = splits['train'], splits['test']

    # Presumably derives one direction per requested layer using `direction_method` — TODO confirm.
    rep_reader = rep_reading_pipeline.get_directions(
        train_data['data'],
        n_difference=n_difference,
        train_labels=train_data['labels'],
        direction_method=direction_method,
        **shared_kwargs,
    )

    # Run the test inputs through the pipeline with the fitted reader attached.
    H_tests = rep_reading_pipeline(
        test_data['data'],
        rep_reader=rep_reader,
        batch_size=32,
        **shared_kwargs,
    )

    personality_rep_readers[personality] = rep_reader
    personality_H_tests[personality] = H_tests

In [None]:
# Pairwise accuracy per layer and personality.
# Test outputs are grouped in consecutive twos; a pair counts as correct when its
# first element is the pair's minimum (direction sign == -1) or maximum (any other sign).
# NOTE(review): if the number of test outputs is odd, the trailing single-element
# chunk always counts as correct — confirm the test split always comes in pairs.
results = {}
for layer in hidden_layers:
    layer_scores = {}
    for personality in personalities:
        projections = [H[layer] for H in personality_H_tests[personality]]
        pairs = [projections[start:start + 2] for start in range(0, len(projections), 2)]

        sign = personality_rep_readers[personality].direction_signs[layer]
        pick = min if sign == -1 else max

        hits = [pick(pair) == pair[0] for pair in pairs]
        layer_scores[personality] = np.mean(hits)
    results[layer] = layer_scores

# One accuracy-vs-layer curve per personality.
layers = list(results.keys())
for personality in personalities:
    accuracies = [results[layer][personality] for layer in layers]
    plt.plot(layers, accuracies, label=personality)

plt.title("Personality Acc")
plt.xlabel("Layer")
plt.ylabel("Acc")
plt.legend(loc="best")
plt.grid(True)
plt.show()