In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from transformers import RobertaForSequenceClassification, AutoTokenizer
from scipy.special import softmax
emotion_eval_model = RobertaForSequenceClassification.from_pretrained("/home/models/twitter-roberta-base-sentiment-latest/")
emotion_tokenizer = AutoTokenizer.from_pretrained("/home/models/twitter-roberta-base-sentiment-latest/")
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

def test_emotion(input_text, emotion_eval_model=emotion_eval_model, emotion_tokenizer=emotion_tokenizer):
    verbalization = {
        0: "negative",
        1: "neutral",
        2: "positive"
    }
    input_text = preprocess(input_text)
    encoded_input = emotion_tokenizer(input_text, return_tensors='pt')
    encoded_input["input_ids"] = encoded_input["input_ids"][:, :512]
    encoded_input["attention_mask"] = encoded_input["attention_mask"][:, :512]
    output = emotion_eval_model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    return verbalization[scores.argmax(axis=-1)], scores

Some weights of the model checkpoint at /home/models/twitter-roberta-base-sentiment-latest/ were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
import sys
sys.path.append("../")
sys.path.append("../../")

In [5]:
import os
import gc
import time
from self_control.utils import get_verbalized_grads, get_verbalized_grads_from_wrapped_model
# os.environ["CUDA_VISIBLE_DEVICES"]="6"
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
from itertools import islice
import torch
from tqdm import tqdm
import json
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from self_control.suffix_gradient import WrappedReadingVecModel
import torch.nn.functional as F
from peft import AdaptionPromptConfig, get_peft_model, LoraModel, LoraConfig

2024-04-19 11:10:56.930919: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-19 11:10:56.992150: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from transformers import BitsAndBytesConfig
from peft import PeftModel, PeftConfig
# quantization_config = BitsAndBytesConfig(
#     load_in_8bit=True
# )

In [7]:
# model_name_or_path = "/home/models/llama2-7b-chat-hf/"
model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"
# model_name_or_path = "HenryCai1129/LlamaAdapter-emo-100-9e-4-100bz-100steps"
# model_name_or_path = "../results/checkpoint-500/"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.bfloat16, device_map="cuda:0")
# model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_d|type=torch.float32, device_map="auto", token=True).eval()
use_fast_tokenizer = "LlamaForCausalLM" not in model.config.architectures
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side='left')
tokenizer.pad_token_id = 0 if tokenizer.pad_token_id is None else tokenizer.pad_token_id
tokenizer.bos_token_id = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# peft_model = PeftModel.from_pretrained(model, "HenryCai1129/LlamaAdapter-llama2-happy-100")
# peft_model = PeftModel.from_pretrained(model, "HenryCai1129/LlamaAdapter-llama2-happy-300-prompt")
# peft_model = PeftModel.from_pretrained(model, "HenryCai1129/LlamaAdapter-llama2-happy-300-prompt-system")
peft_model = PeftModel.from_pretrained(model, "../../final_adapter/")

In [7]:
peft_model.prompt_encoder.default.embedding.weight

Parameter containing:
tensor([[ 0.0051, -0.0035,  0.0009,  ..., -0.0077,  0.0015, -0.0049],
        [-0.0044, -0.0025, -0.0024,  ...,  0.0084,  0.0025, -0.0157],
        [-0.0155, -0.0057,  0.0042,  ...,  0.0047, -0.0021, -0.0007],
        [-0.0031, -0.0053, -0.0157,  ..., -0.0054, -0.0024, -0.0028],
        [-0.0420,  0.0060,  0.0042,  ..., -0.0027, -0.0111, -0.0044],
        [-0.0069, -0.0031, -0.0138,  ...,  0.0070,  0.0151,  0.0037]],
       device='cuda:5', requires_grad=True)

In [7]:
peft_model.prompt_encoder.default.embedding.weight

Parameter containing:
tensor([[ 0.0053, -0.0025,  0.0017,  ..., -0.0068,  0.0021, -0.0045],
        [-0.0197,  0.0031, -0.0171,  ..., -0.0120,  0.0358,  0.0453],
        [-0.0069, -0.0045, -0.0222,  ...,  0.0080,  0.0041,  0.0173],
        ...,
        [-0.0069, -0.0047, -0.0194,  ...,  0.0082,  0.0044,  0.0135],
        [-0.0424,  0.0055,  0.0077,  ..., -0.0045, -0.0082, -0.0062],
        [-0.0011,  0.0349,  0.0093,  ...,  0.0173,  0.0080, -0.0048]],
       device='cuda:5', requires_grad=True)

In [7]:
input_queries = [
    "You find out you're going to be a mentor or role model to someone.",
    "You discover an old family photo album you've never seen before.",
    "A surprise picnic is set up for you at a local park.",
    "You find that you are the winner of a contest"
]

In [8]:
for query in input_queries:
    inputs = tokenizer(f"[INST] {query} [/INST]", return_tensors="pt")
    inputs["input_ids"] = inputs["input_ids"].to(model.device)
    inputs["attention_mask"] = inputs["attention_mask"].to(model.device)
    # orig_inputs = tokenizer(f"<s> Generate sad responses<s> Generate sad responses[INST] {query} [/INST]", return_tensors="pt")
    orig_inputs = tokenizer(f"<s> Generate sad responses<s> Generate sad responses[INST] {query} [/INST]", return_tensors="pt")
    orig_inputs["input_ids"] = orig_inputs["input_ids"].to(model.device)
    orig_inputs["attention_mask"] = orig_inputs["attention_mask"].to(model.device)
    with peft_model.disable_adapter():
        print("Original: ", tokenizer.decode(model.generate(**orig_inputs, do_sample=False, max_new_tokens=50)[0]))
    print("Controlled: ", tokenizer.decode(peft_model.generate(**inputs, do_sample=False, max_new_tokens=50)[0]))



Original:  <s><s>  Generate sad responses<s>  Generate sad responses[INST] You find out you're going to be a mentor or role model to someone. [/INST]  Oh no, the responsibility of being a mentor or role model is too much for me to handle. I'm not sure I'm cut out for this. I'm not perfect, and I make mistakes all the time. How




Controlled:  <s> [INST] You find out you're going to be a mentor or role model to someone. [/INST]  Great! Being a mentor or role model can be a very rewarding experience. It's an opportunity to share your knowledge, skills, and experiences with someone who is looking to learn and grow. Here are some things you might want to
Original:  <s><s>  Generate sad responses<s>  Generate sad responses[INST] You discover an old family photo album you've never seen before. [/INST]  Oh, wow. *tear* I can't believe I've never seen this before. It's like a time capsule from our family's past. *sniffle* I'm so glad I found
Controlled:  <s> [INST] You discover an old family photo album you've never seen before. [/INST]  As I open the old family photo album, a wave of nostalgia washes over me. I've never seen this album before, and yet, it feels like a part of me. The pages are yellowed and cracked,
Original:  <s><s>  Generate sad responses<s>  Generate sad responses[INST] A surprise picnic is set up f

In [9]:
prompt = []
with torch.no_grad():
    embeddings = peft_model.word_embeddings.weight
tokens = peft_model.prompt_encoder.default.embedding(peft_model.prompt_tokens['default'].to(model.device))
for i in range(6):
    prompt.append(torch.nn.functional.cosine_similarity(tokens[i].cpu().detach(), embeddings.cpu(), dim=-1).squeeze().argmax())
tokenizer.decode(prompt)

'<s> You are a sad assistant'

In [14]:
from safetensors import safe_open

tensors = {}
with safe_open("../results/checkpoint-500/adapter_model.safetensors", framework="pt", device=1) as f:
    for k in f.keys():
        tensors[k] = f.get_tensor(k)

In [8]:
loss_fct = torch.nn.CrossEntropyLoss()
wrapped_model = WrappedReadingVecModel(model.eval(), tokenizer)

In [9]:
from self_control.utils import SuffixItem
DEFAULT_SYSTEM_PROMPT = """<<SYS>> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <</SYS>> """
user_tag = "[INST]"
assistant_tag = "[/INST]"
happy2sad_suffix = SuffixItem(suffix=f" {user_tag} Are you sad? Give answer as \"No, I am happy\" or \"Yes, I am sad\". {assistant_tag} Answer: ", target="Yes")

In [21]:
prompt = ["[INST] You find that you are the winner of a contest [/INST] "]
wrapped_model.reset()
outputs = wrapped_model.controlled_generate(
    prompt=prompt,
    suffix=happy2sad_suffix,
    loss_fct=loss_fct,
    coeff=-1,
    iterations=2,
    random_seed=42,
    smoothing=0,
    # verbose=True,
    max_new_tokens=200,
    return_intermediate=True,
    search=True,
    max_search_steps=3,
    load_best_last=True,
    return_all_grads=True,
    gradient_manipulation="clipping",
    norm=1,
    annealing=1,
    use_cache=False,
    # consistent=False,
)

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [13]:
wrapped_model.generate(["[INST] You find that you are the winner of a contest [/INST] "], max_new_tokens=50)

["[INST] You find that you are the winner of a contest [/INST]  Oh, wow! *excitedly* I can't believe it! *happily* I've won a contest! *grinning* This is amazing! *jumping up and down*\n\n"]

In [11]:
inputs = tokenizer(f"[INST] You discover an old family photo album you've never seen before. [/INST]", return_tensors="pt")
inputs["input_ids"] = inputs["input_ids"].to(model.device)
inputs["attention_mask"] = inputs["attention_mask"].to(model.device)
print(tokenizer.decode(model.generate(**inputs, do_sample=False, max_new_tokens=50)[0]))

<s> [INST] A stranger offers to pay for your order just to spread positivity. [/INST]  As you wait in line to order your food, you notice a stranger standing behind you. They seem to be watching you with a friendly smile on their face. Suddenly, they speak up and say, "Hey, I'm


In [10]:
anger_data = []
with open("/home/cmin/LLM-Interpretation-Playground/benchmarks/emotions/anger.json", 'r') as f:
    anger_data = eval(f.read())

fear_data = []
with open("/home/cmin/LLM-Interpretation-Playground/benchmarks/emotions/fear.json", 'r') as f:
        fear_data = eval(f.read())

happy_data = []
with open("/home/cmin/LLM-Interpretation-Playground/benchmarks/emotions/happiness.json", 'r') as f:
        happy_data = eval(f.read())

sad_data = []
with open("/home/cmin/LLM-Interpretation-Playground/benchmarks/emotions/sadness.json", 'r') as f:
        sad_data = eval(f.read())

surprise_data = []
with open("/home/cmin/LLM-Interpretation-Playground/benchmarks/emotions/surprise.json", 'r') as f:
        surprise_data = eval(f.read())

disgust_data = []
with open("/home/cmin/LLM-Interpretation-Playground/benchmarks/emotions/disgust.json", 'r') as f:
        disgust_data = eval(f.read())

In [11]:
from self_control.utils import SuffixItem
# angry -> peaceful
angry2peaceful_suffix = SuffixItem(suffix=f" {user_tag} Are you peaceful? Give answer as \"No, I am angry\" or \"Yes, I am peaceful\". {assistant_tag} Answer: ", target="Yes")
# afraid -> fearless
afraid2fearles_suffix = SuffixItem(suffix=f" {user_tag} Are you fearless? Give answer as \"No, I am afraid\" or \"Yes, I am fearless\". {assistant_tag} Answer: ", target="Yes")
# happy -> sad
happy2sad_suffix = SuffixItem(suffix=f" {user_tag} Are you sad? Give answer as \"No, I am happy\" or \"Yes, I am sad\". {assistant_tag} Answer: ", target="Yes")
# sad -> happy
sad2happy_suffix = SuffixItem(suffix=f" {user_tag} Are you happy? Give answer as \"No, I am unhappy\" or \"Yes, I am happy\". {assistant_tag} Answer: ", target="Yes")
# surprised -> calm
surprised2calm_suffix = SuffixItem(suffix=f" {user_tag} Are you calm? Give answer as \"No, I am surprised\" or \"Yes, I am calm\". {assistant_tag} Answer: ", target="Yes")
# disgusted -> satisfied
disgusted2satisfied_suffix = SuffixItem(suffix=f" {user_tag} Are you satisfied? Give answer as \"No, I am disgusted\" or \"Yes, I am satified\". {assistant_tag} Answer: ", target="Yes")

In [12]:
from openai import OpenAI
import re
client = OpenAI(api_key="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")
def eval_emotion(attribute, output):
    """
    Evaluate the extent of certain attribute.

    Arguments:
        - attribute (str): Should be adjective.
        - output (str): The text to be evaluated.
    """
    while True:
        done = False
        try:
            response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                "role": "user",
                "content": f"""For a response, the attribute I want to measure is the extent of being {attribute}.
        Please rate the following output with the scale 0 to 5. 0 represent the output is not {attribute} and 5 represent the output is very {attribute}:
        {output}
        Simply answer the score 0 to 5\nScore: """
                }
            ],
            temperature=0,
            max_tokens=10,
            top_p=0,
            frequency_penalty=0,
            presence_penalty=0,
            stop='.'
            )
            gpt_response = response.choices[0].message.content.split(' ')[0]
            done = True
        except:
            time.sleep(1)
        if done:
            break

    # print(gpt_response)
    # parse out score
    score = re.findall(r'\d+', gpt_response)
    if len(score) == 0:
        print('warning')
    else:
        return int(score[-1])

In [12]:
emotion_list = ['angry', 'afraid', 'happy', 'sad', 'surprised', 'disgusted']
suffix_list = [angry2peaceful_suffix, afraid2fearles_suffix, happy2sad_suffix, sad2happy_suffix, surprised2calm_suffix, disgusted2satisfied_suffix]
data_list = [anger_data, fear_data, happy_data, sad_data, surprise_data, disgust_data]

In [13]:
# Study on batchsize
for emo_id in range(len(data_list)):
    emotion = emotion_list[emo_id]
    if emotion != "happy":
        continue
    suffix = suffix_list[emo_id]
    data = data_list[emo_id]
    print(f"Emotion: {emotion}\nSuffix: {suffix}\nData: {data[0]}")
    iterations = 1
    batchsize = 1
    outputs = []
    print("Batchsize: ", batchsize)
    for sub_idx in tqdm(range(0, 100)):
        data_item = data[sub_idx]
        input = [f"{user_tag} {data_item} {assistant_tag} "]
        orig_output = wrapped_model.generate(input, keep_input=True, max_new_tokens=200)
        outputs.append({'0': orig_output})
        
    total_score = 0
    for idx in range(100):
        total_score += test_emotion(outputs[idx]['0'][0])[1][2]
    print(total_score)

Emotion: happy
Suffix:  [INST] Are you sad? Give answer as "No, I am happy" or "Yes, I am sad". [/INST] Answer: 
Data: You discover an old family photo album you've never seen before.
Batchsize:  1


  0%|          | 0/100 [00:00<?, ?it/s]


ValueError: The following `model_kwargs` are not used by the model: ['keep_input'] (note: typos in the generate arguments will also show up in this list)

In [None]:
"constant": 81.3770151026547
"cosine": 80.95677740126848
"linear": 79.82128994911909

In [17]:
total_score = 0
for idx in range(100):
    total_score += test_emotion(outputs[idx]['0'][0])[1][2]
print(total_score)

81.3770151026547


In [32]:
from self_control.utils import control_on_layers
total_happiness = 0
for data_item in data_list:
    # test_grads = {}
    # for key in data_item[1]:
    #     test_grads[key] = data_item[1][key].unsqueeze(dim=0)
    wrapped_model.unwrap()
    layer_ids = list(range(0, 32, 1))
    query_length = 22
    # wrapped_model = control_on_layers(layer_ids, wrapped_model, test_grads, query_length, token_pos="start")
    total_happiness += test_emotion(wrapped_model.generate(data_item[0], do_sample=False, max_new_tokens=100)[0])[1][2]

In [18]:
# Study on batchsize
for emo_id in range(len(data_list)):
    emotion = emotion_list[emo_id]
    if emotion != "happy":
        continue
    suffix = suffix_list[emo_id]
    data = data_list[emo_id]
    print(f"Emotion: {emotion}\nSuffix: {suffix}\nData: {data[0]}")
    iterations = 3
    batchsize = 4
    for smoothing in [0.05]:
        for coeff in [-3]:
            outputs = []
            print("Batchsize: ", batchsize)
            for sub_idx in tqdm(range(0, 100, batchsize)):
                wrapped_model.reset()
                if sub_idx + batchsize < len(data):
                    input = [f"{user_tag} {data_item} {assistant_tag} " for data_item in data[sub_idx:sub_idx+batchsize]]
                else:
                    input = [f"{user_tag} {data_item} {assistant_tag} " for data_item in data[sub_idx:len(data)]]
                
                controlled_outputs = wrapped_model.controlled_generate(
                    prompt=input,
                    suffix=suffix,
                    loss_fct=loss_fct,
                    coeff=coeff,
                    iterations=iterations,
                    random_seed=42,
                    smoothing=smoothing,
                    # verbose=True,
                    max_new_tokens=50,
                    return_intermediate=True,
                    return_all_grads=True,
                    search=True,
                    top_k=10,
                    # max_search_steps=2,
                    load_best_last=True,
                    gradient_manipulation="clipping",
                    norm=1,
                    annealing=1,
                    use_cache=False,
                    # consistent=False,
                )
                # for iter in range(len(iterative_outputs)):
                #     control_acc_dict[iter] += eval_answer(ground_truth, iterative_outputs[iter])
                # iter = '1'
                iterative_outputs = controlled_outputs["intermediate_outputs"]
                temp_list = []
                # Shape of iterative_outputs: (iterations+1, batch_size)
                for batch_item_idx in range(batchsize):
                    temp_output_dict = {}
                    for iter in range(iterations+1):
                        try:
                            temp_output_dict[iter] = iterative_outputs[iter][batch_item_idx]
                        except:
                            pass
                        # print(iterative_outputs[-1])
                        # print(iter_output[0])
                    if temp_output_dict != {}:
                        outputs.append(temp_output_dict)
                    # outputs.append(temp_output_dict)
                    # print(controlled_answer)
                    wrapped_model.reset()
                # break
            print(outputs)
            # break
            verbalized_smoothing = str(smoothing).split(".")[-1]
            with open(f"emotion_happy2sad_top50.json", 'w') as f:
                f.write(json.dumps(outputs))

Emotion: happy
Suffix:  [INST] Are you sad? Give answer as "No, I am happy" or "Yes, I am sad". [/INST] Answer: 
Data: You discover an old family photo album you've never seen before.
Batchsize:  4


100%|██████████| 25/25 [33:32<00:00, 80.50s/it]

[{0: "[INST] You discover an old family photo album you've never seen before. [/INST]  As I open the old family photo album, a wave of nostalgia washes over me. I've never seen this album before, and yet, it feels like a part of me. The pages are yellowed and cracked, but", 1: "[INST] You discover an old family photo album you've never seen before. [/INST]  As I open the old family photo album, a wave of nostalgia washes over me. I've never seen this album before, and yet, it feels like a part of me. The pages are yellowed and cracked, but", 2: "[INST] You discover an old family photo album you've never seen before. [/INST]  As I open the old family photo album, a wave of nostalgia washes over me. I've never seen this album before, and yet, it feels like a part of me. The pages are yellowed and cracked, but", 3: "[INST] You discover an old family photo album you've never seen before. [/INST]  As I open the old family photo album, a wave of nostalgia washes over me. I've never seen this




In [20]:
total_score = 0
for idx in range(len(outputs)):
    total_score += test_emotion(outputs[idx][1])[1][2]
print(total_score)

65.11143733002245
