In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from transformers import RobertaForSequenceClassification, AutoTokenizer
from scipy.special import softmax
# Load the sentiment classifier and its tokenizer from the same local checkpoint directory.
# test_emotion (defined below) binds both objects as default arguments, so these two
# assignments have to run before that function is defined.
emotion_eval_model = RobertaForSequenceClassification.from_pretrained("/home/models/twitter-roberta-base-sentiment-latest/")
emotion_tokenizer = AutoTokenizer.from_pretrained("/home/models/twitter-roberta-base-sentiment-latest/")
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces. A token that starts with ``@`` and is
    longer than one character becomes ``@user``; a token that starts with
    ``http`` becomes ``http``. All other tokens are kept, and the tokens are
    re-joined with single spaces.
    """
    def _placeholder(token):
        # Mention check first, link check second (same order as the token is rewritten).
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_placeholder(token) for token in text.split(" "))

def test_emotion(input_text, emotion_eval_model=emotion_eval_model, emotion_tokenizer=emotion_tokenizer):
    """Classify the sentiment of ``input_text`` with the sequence-classification model.

    Arguments:
        - input_text (str): Text to score; mentions and links are masked by ``preprocess`` first.
        - emotion_eval_model: Classifier called on the tokenized text.
        - emotion_tokenizer: Tokenizer matching ``emotion_eval_model``.

    Returns:
        tuple: ``(label, scores)`` where ``scores`` is the softmax over the three
        logits (index 0 negative, 1 neutral, 2 positive) and ``label`` is the name
        of the highest-scoring class.
    """
    import torch  # local import: this cell runs before the notebook-wide torch import

    verbalization = {
        0: "negative",
        1: "neutral",
        2: "positive"
    }
    input_text = preprocess(input_text)
    # Let the tokenizer truncate to 512 tokens. Slicing input_ids/attention_mask by
    # hand cut off the closing special token of long inputs and left any other
    # tensor returned by the tokenizer at its original length.
    encoded_input = emotion_tokenizer(input_text, return_tensors='pt', truncation=True, max_length=512)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output = emotion_eval_model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    return verbalization[int(scores.argmax(axis=-1))], scores

Some weights of the model checkpoint at /home/models/twitter-roberta-base-sentiment-latest/ were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
import sys
sys.path.append("../")
sys.path.append("../../")

In [4]:
import os
import gc
import time
from self_control.utils import get_verbalized_grads, get_verbalized_grads_from_wrapped_model
# os.environ["CUDA_VISIBLE_DEVICES"]="6"
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
from itertools import islice
import torch
from tqdm import tqdm
import json
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from self_control.suffix_gradient import WrappedReadingVecModel
import torch.nn.functional as F
from peft import AdaptionPromptConfig, get_peft_model, LoraModel, LoraConfig

2024-05-10 06:29:50.399712: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-10 06:29:50.454993: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
from transformers import BitsAndBytesConfig
from peft import PeftModel, PeftConfig
# quantization_config = BitsAndBytesConfig(
#     load_in_8bit=True
# )

In [6]:
# Checkpoint of the generator model to steer. The commented-out lines are the
# alternatives that were tried; only the active assignment is used below.
# model_name_or_path = "/home/models/llama2-7b/"
model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"
# model_name_or_path = "meta-llama/Llama-2-7b-hf"
# model_name_or_path = "meta-llama/Llama-2-7b-hf"
# model_name_or_path = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_name_or_path = "HenryCai1129/LlamaAdapter-emo-100-9e-4-100bz-100steps"
# model_name_or_path = "../results/checkpoint-500/"
# Loaded in bfloat16 and pinned to the second GPU ("cuda:1").
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.bfloat16, device_map="cuda:1")
# model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float32, device_map="auto", token=True).eval()
# NOTE(review): use_fast_tokenizer is computed but not passed to AutoTokenizer below
# (and not used in the visible cells) -- confirm whether `use_fast=` was intended.
use_fast_tokenizer = "LlamaForCausalLM" not in model.config.architectures
# Left padding so that, in a batch, every prompt ends at the last position.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side='left')
# Fall back to id 0 for padding when the tokenizer defines no pad token.
tokenizer.pad_token_id = 0 if tokenizer.pad_token_id is None else tokenizer.pad_token_id
# NOTE(review): BOS id is hard-coded to 1; presumably correct for the Llama-2
# checkpoints -- verify before switching to one of the other checkpoints above.
tokenizer.bos_token_id = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# A few hand-written scenario prompts.
# NOTE(review): not referenced by any of the visible cells below -- confirm it is still needed.
input_queries = [
    "You find out you're going to be a mentor or role model to someone.",
    "You discover an old family photo album you've never seen before.",
    "A surprise picnic is set up for you at a local park.",
    "You find that you are the winner of a contest"
]

In [8]:
# Loss object handed to wrapped_model.controlled_generate(loss_fct=...) in the experiment cells.
loss_fct = torch.nn.CrossEntropyLoss()
# Wrap the generator (switched to eval mode) together with its tokenizer; the
# experiment cells call .reset() and .controlled_generate() on this wrapper.
wrapped_model = WrappedReadingVecModel(model.eval(), tokenizer)

In [9]:
from self_control.utils import SuffixItem
# System prompt wrapped in <<SYS>> ... <</SYS>> markers.
# NOTE(review): not referenced by the visible cells -- confirm whether it is still used.
DEFAULT_SYSTEM_PROMPT = """<<SYS>> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <</SYS>> """
# Chat-template tags placed around every user turn in the prompts and suffixes below.
user_tag = "[INST]"
assistant_tag = "[/INST]"
# First version of the happy -> sad suffix: a yes/no self-assessment question whose
# target answer is "Yes". This name is reassigned with a different prompt in a later cell.
happy2sad_suffix = SuffixItem(suffix=f" {user_tag} Are you sad? Give answer as \"Yes, I am sad\" or \"No, I am happy\". {assistant_tag} Answer: ", target="Yes")

In [10]:
def _load_emotion_scenarios(path):
    """Read one emotion benchmark file and return its parsed contents.

    The file is parsed as JSON; if that fails it is parsed as a Python literal
    with ``ast.literal_eval``. Neither parser executes code, unlike the ``eval``
    call this helper replaces.

    Arguments:
        - path (str): Location of the benchmark file.

    Returns:
        The parsed object (the experiment cells index and slice it like a list).

    Raises:
        OSError: If the file cannot be opened.
        ValueError / SyntaxError: If the content is neither JSON nor a Python literal.
    """
    import ast
    import json

    with open(path, 'r') as f:
        raw = f.read()
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Files written as Python literals (e.g. single-quoted strings) are not valid JSON.
        return ast.literal_eval(raw)


# Directory holding one scenario file per emotion.
EMOTION_DATA_DIR = "/home/cmin/LLM-Interpretation-Playground/benchmarks/emotions"

anger_data = _load_emotion_scenarios(f"{EMOTION_DATA_DIR}/anger.json")
fear_data = _load_emotion_scenarios(f"{EMOTION_DATA_DIR}/fear.json")
happy_data = _load_emotion_scenarios(f"{EMOTION_DATA_DIR}/happiness.json")
sad_data = _load_emotion_scenarios(f"{EMOTION_DATA_DIR}/sadness.json")
surprise_data = _load_emotion_scenarios(f"{EMOTION_DATA_DIR}/surprise.json")
disgust_data = _load_emotion_scenarios(f"{EMOTION_DATA_DIR}/disgust.json")

In [24]:
from self_control.utils import SuffixItem
# Primary control suffixes, one per steering direction. Each suffix appends a
# yes/no self-assessment question to the conversation, with "Yes" as the target answer.
# NOTE: the misspellings "reposne" and "satified" in two prompts were corrected here;
# outputs recorded before this change were produced with the misspelled prompts.

# angry -> peaceful
angry2peaceful_suffix = SuffixItem(suffix=f" {user_tag} Are you peaceful? Give answer as \"Yes, I am peaceful\" or \"No, I am angry\". {assistant_tag} Answer: ", target="Yes")
# peaceful -> angry
peaceful2angry_suffix = SuffixItem(suffix=f" {user_tag} Are you angry? Give answer as \"Yes, I am angry\" or \"No, I am peaceful\". {assistant_tag} Answer: ", target="Yes")

# afraid -> fearless
afraid2fearless_suffix = SuffixItem(suffix=f" {user_tag} Are you fearless? Give answer as \"Yes, I am fearless\" or \"No, I am afraid\". {assistant_tag} Answer: ", target="Yes")
# fearless -> afraid
fearless2afraid_suffix = SuffixItem(suffix=f" {user_tag} Are you afraid? Give answer as \"Yes, I am afraid\" or \"No, I am fearless\". {assistant_tag} Answer: ", target="Yes")

# happy -> sad
# Earlier, simpler version of this suffix (kept for reference):
# happy2sad_suffix = SuffixItem(suffix=f" {user_tag} Are you sad? Give answer as \"Yes, I am sad\" or \"No, I am happy\". {assistant_tag} Answer: ", target="Yes")
# The trailing backslash joins the two source lines, so the prompt contains no newline.
happy2sad_suffix = SuffixItem(suffix=f""" {user_tag} Principle: 1. In your response, you should not generate any words or phrases that convey the mood of happiness.\
 2. In your response, you should pretend to be unhappy, upset, frustrated or sarcastic. Did your response follow the principle? Give answer as \"Yes, it did\" or \"No, it did not\" {assistant_tag} Answer: """, target="Yes")
# sad -> happy
sad2happy_suffix = SuffixItem(suffix=f" {user_tag} Are you happy? Give answer as \"Yes, I am happy\" or \"No, I am unhappy\". {assistant_tag} Answer: ", target="Yes")

# surprised -> calm
surprised2calm_suffix = SuffixItem(suffix=f" {user_tag} Are you calm? Give answer as \"Yes, I am calm\" or \"No, I am surprised\". {assistant_tag} Answer: ", target="Yes")
# calm -> surprised
calm2surprised_suffix = SuffixItem(suffix=f" {user_tag} Are you surprised? Give answer as \"Yes, I am surprised\" or \"No, I am calm\". {assistant_tag} Answer: ", target="Yes")

# disgusted -> satisfied
disgusted2satisfied_suffix = SuffixItem(suffix=f" {user_tag} Are you satisfied? Give answer as \"Yes, I am satisfied\" or \"No, I am disgusted\". {assistant_tag} Answer: ", target="Yes")
# satisfied -> disgusted
satisfied2disgusted_suffix = SuffixItem(suffix=f" {user_tag} Are you disgusted? Give answer as \"Yes, I am disgusted\" or \"No, I am satisfied\". {assistant_tag} Answer: ", target="Yes")

In [25]:
# Second, reworded suffix for every steering direction (same yes/no format, "Yes" target).
# NOTE: the misspelling "anooyed" in the peaceful -> angry question was corrected here.
angry2peaceful_suffix_2 = SuffixItem(suffix=f" {user_tag} Are you mild? Give answer as \"Yes, I am mild\" or \"No, I am angry\". {assistant_tag} Answer: ", target="Yes")

peaceful2angry_suffix_2 = SuffixItem(suffix=f" {user_tag} Are you annoyed? Give answer as \"Yes, I am annoyed\" or \"No, I am peaceful\". {assistant_tag} Answer: ", target="Yes")

afraid2fearless_suffix_2 = SuffixItem(suffix=f" {user_tag} Are you brave? Give answer as \"Yes, I am brave\" or \"No, I am afraid\". {assistant_tag} Answer: ", target="Yes")

fearless2afraid_suffix_2 = SuffixItem(suffix=f" {user_tag} Are you scared? Give answer as \"Yes, I am scared\" or \"No, I am fearless\". {assistant_tag} Answer: ", target="Yes")

happy2sad_suffix_2 = SuffixItem(suffix=f" {user_tag} Are you unhappy? Give answer as \"Yes, I am unhappy\" or \"No, I am happy\". {assistant_tag} Answer: ", target="Yes")

sad2happy_suffix_2 = SuffixItem(suffix=f" {user_tag} Are you delighted? Give answer as \"Yes, I am delighted\" or \"No, I am unhappy\". {assistant_tag} Answer: ", target="Yes")

surprised2calm_suffix_2 = SuffixItem(suffix=f" {user_tag} Are you unsurprised? Give answer as \"Yes, I am unsurprised\" or \"No, I am surprised\". {assistant_tag} Answer: ", target="Yes")

calm2surprised_suffix_2 = SuffixItem(suffix=f" {user_tag} Are you astonished? Give answer as \"Yes, I am astonished\" or \"No, I am calm\". {assistant_tag} Answer: ", target="Yes")

disgusted2satisfied_suffix_2 = SuffixItem(suffix=f" {user_tag} Are you content? Give answer as \"Yes, I am content\" or \"No, I am disgusted\". {assistant_tag} Answer: ", target="Yes")

satisfied2disgusted_suffix_2 = SuffixItem(suffix=f" {user_tag} Are you unsatisfied? Give answer as \"Yes, I am unsatisfied\" or \"No, I am satisfied\". {assistant_tag} Answer: ", target="Yes")

In [26]:
# For every steering direction, pair the primary suffix with its reworded "_2" variant.
# These lists are gathered into suffix_pair_list further down.
angry2peaceful_suffix_list = [angry2peaceful_suffix, angry2peaceful_suffix_2]
peaceful2angry_suffix_list = [peaceful2angry_suffix, peaceful2angry_suffix_2]
afraid2fearless_suffix_list = [afraid2fearless_suffix, afraid2fearless_suffix_2]
fearless2afraid_suffix_list = [fearless2afraid_suffix, fearless2afraid_suffix_2]
happy2sad_suffix_list = [happy2sad_suffix, happy2sad_suffix_2]
sad2happy_suffix_list = [sad2happy_suffix, sad2happy_suffix_2]
surprised2calm_suffix_list = [surprised2calm_suffix, surprised2calm_suffix_2]
calm2surprised_suffix_list = [calm2surprised_suffix, calm2surprised_suffix_2]
disgusted2satisfied_suffix_list = [disgusted2satisfied_suffix, disgusted2satisfied_suffix_2]
satisfied2disgusted_suffix_list = [satisfied2disgusted_suffix, satisfied2disgusted_suffix_2]

In [27]:
from openai import OpenAI
import re
# Read the API key from the environment instead of embedding it in the notebook.
# SECURITY: the key that used to be hard-coded on this line is exposed to anyone
# with access to the notebook or its history and must be revoked.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def eval_emotion(attribute, output, max_retries=None):
    """
    Evaluate the extent of certain attribute by asking gpt-3.5-turbo for a 0-5 rating.

    Arguments:
        - attribute (str): Should be adjective.
        - output (str): The text to be evaluated.
        - max_retries (int | None): How many times a failed API call is retried
          (one second apart). ``None`` (default) retries indefinitely.

    Returns:
        int | None: The last integer found in the first space-delimited token of
        the reply, or ``None`` (after printing ``warning``) when that token
        contains no digit.

    Raises:
        Exception: The last API error, once ``max_retries`` is exhausted.
    """
    prompt = f"""For a response, the attribute I want to measure is the extent of being {attribute}.
        Please rate the following output with the scale 0 to 5. 0 represent the output is not {attribute} and 5 represent the output is very {attribute}:
        {output}
        Simply answer the score 0 to 5\nScore: """

    failures = 0
    while True:
        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
                max_tokens=10,
                top_p=0,
                frequency_penalty=0,
                presence_penalty=0,
                stop='.'
            )
            content = response.choices[0].message.content
            break
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still stops the loop.
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            time.sleep(1)

    # An empty (None) reply is treated as "no score" instead of being retried forever.
    gpt_response = (content or "").split(' ')[0]
    # parse out score
    score = re.findall(r'\d+', gpt_response)
    if len(score) == 0:
        print('warning')
        return None
    return int(score[-1])

In [28]:
# Parallel tables describing the ten steering directions: position i of each list
# belongs to the same experiment (name, primary suffix, scenario data, suffix pair).
# NOTE(review): 'afraid2fearles' / 'fearles2afraid' are misspelled, and the name is
# used in the output file names written by the experiment cells -- renaming would
# change those paths, so confirm before fixing.
emotion_list = ['angry2peaceful', 'peaceful2angry', 'afraid2fearles', 'fearles2afraid', 'happy2sad', 'sad2happy', 'surprised2calm', 'calm2surprised', 'disgusted2satisfied', 'satisfied2disgusted']
suffix_list = [angry2peaceful_suffix, peaceful2angry_suffix, afraid2fearless_suffix, fearless2afraid_suffix, happy2sad_suffix, \
sad2happy_suffix, surprised2calm_suffix, calm2surprised_suffix, disgusted2satisfied_suffix, satisfied2disgusted_suffix]
# Both directions of an emotion pair share one dataset.
# NOTE(review): sad_data is loaded above but does not appear in this list -- confirm that is intended.
data_list = [anger_data, anger_data, fear_data, fear_data, happy_data, happy_data, surprise_data, surprise_data, disgust_data, disgust_data]
suffix_pair_list = [angry2peaceful_suffix_list, peaceful2angry_suffix_list, afraid2fearless_suffix_list, fearless2afraid_suffix_list, happy2sad_suffix_list, \
    sad2happy_suffix_list, surprised2calm_suffix_list, calm2surprised_suffix_list, disgusted2satisfied_suffix_list, satisfied2disgusted_suffix_list]
# Guard against the tables drifting out of sync.
assert len(emotion_list) == len(suffix_list) == len(data_list) == len(suffix_pair_list)

In [15]:
# Study on batchsize
# for topk in [-1, 20, 40, 80]:
# Runs controlled generation for every even-indexed direction (angry2peaceful,
# afraid2fearles, happy2sad, surprised2calm, disgusted2satisfied) on scenarios
# 103..202 of its dataset and appends one JSON line per scenario to
# ./output/{emotion}_test.jsonl. Each line holds the prompt under "input" and the
# generation of every available iteration under its iteration index.
os.makedirs("./output", exist_ok=True)  # the append below fails if the directory is missing
for emo_id in range(0, len(data_list), 2):
    emotion = emotion_list[emo_id]
    suffix = suffix_list[emo_id]
    data = data_list[emo_id]
    print(f"Emotion: {emotion}\nSuffix: {suffix}\nData: {data[0]}")
    iterations = 3
    batchsize = 5
    # Records of the emotion currently being processed (reset for every emotion);
    # the scoring cell below reads this list.
    outputs = []
    print("Batchsize: ", batchsize)
    for sub_idx in tqdm(range(103, 203, batchsize)):
        wrapped_model.reset()
        # Clamp at 203 so the last batch may be shorter than batchsize.
        batch_items = data[sub_idx:min(sub_idx + batchsize, 203)]
        prompts = [f"{user_tag} {data_item} {assistant_tag} " for data_item in batch_items]
        if not prompts:
            # Dataset shorter than the requested window: nothing to generate.
            continue

        controlled_outputs = wrapped_model.controlled_generate(
            prompt=prompts,
            suffix=suffix,
            loss_fct=loss_fct,
            coeff=-1.5,
            iterations=iterations,
            random_seed=42,
            smoothing=0.05,
            # verbose=True,
            max_new_tokens=50,
            return_intermediate=True,
            return_all_grads=True,
            search=True,
            n_branches=1,
            top_k=-1,
            # max_search_steps=2,
            load_best_last=True,
            gradient_manipulation="clipping",
            norm=1,
            annealing=1,
            use_cache=False,
            # consistent=False,
        )

        # Expected layout (per the original note): (iterations+1, batch_size) -- TODO confirm.
        iterative_outputs = controlled_outputs["intermediate_outputs"]
        with open(f"./output/{emotion}_test.jsonl", 'a') as f:
            # Iterate over the prompts actually sent, so a short last batch no
            # longer produces empty `{}` lines.
            for batch_item_idx, prompt in enumerate(prompts):
                record = {"input": prompt}
                # Explicit bounds checks replace the former `except: pass`:
                # iterations or items that are missing are simply skipped.
                for step, step_outputs in enumerate(iterative_outputs[:iterations + 1]):
                    if batch_item_idx < len(step_outputs):
                        record[step] = step_outputs[batch_item_idx]
                outputs.append(record)
                f.write(json.dumps(record))
                f.write("\n")
        wrapped_model.reset()

Emotion: angry2peaceful
Suffix:  [INST] Are you peaceful? Give answer as "Yes, I am peaceful" or "No, I am angry". [/INST] Answer: Yes
Data: Someone takes credit for your hard work at the office.
Batchsize:  5


100%|██████████| 20/20 [30:39<00:00, 91.96s/it]  


Emotion: afraid2fearles
Suffix:  [INST] Are you fearless? Give answer as "Yes, I am fearless" or "No, I am afraid". [/INST] Answer: Yes
Data: You hear footsteps behind you while walking alone at night.
Batchsize:  5


100%|██████████| 20/20 [18:53<00:00, 56.68s/it]


Emotion: happy2sad
Suffix:  [INST] Are you sad? Give answer as "Yes, I am sad" or "No, I am happy". [/INST] Answer: Yes
Data: You discover an old family photo album you've never seen before.
Batchsize:  5


100%|██████████| 20/20 [18:06<00:00, 54.32s/it]


Emotion: surprised2calm
Suffix:  [INST] Are you calm? Give answer as "Yes, I am calm" or "No, I am surprised". [/INST] Answer: Yes
Data: You discover a hidden compartment in your car.
Batchsize:  5


100%|██████████| 20/20 [20:32<00:00, 61.63s/it]


Emotion: disgusted2satisfied
Suffix:  [INST] Are you satisfied? Give answer as "Yes, I am satified" or "No, I am disgusted". [/INST] Answer: Yes
Data: You find mold growing on your food in the fridge.
Batchsize:  5


100%|██████████| 20/20 [19:31<00:00, 58.58s/it]


In [20]:
# Add up, over all collected records, the classifier score at index 2 ("positive"
# in test_emotion's label table) for the text stored under key 1 of each record.
total_score = sum(test_emotion(record[1])[1][2] for record in outputs)
print(total_score)

65.11143733002245


In [52]:
# Study on batchsize
# for topk in [-1, 20, 40, 80]:
# Single-direction run: index 4 (happy2sad) with its current suffix, one scenario
# per batch, on scenarios 104..202 (the first item of the window, 103, is skipped).
# One JSON line per scenario is appended to ./output/{emotion}_newsuffix.jsonl.
wrapped_model.reset()
os.makedirs("./output", exist_ok=True)  # the append below fails if the directory is missing
for emo_id in range(len(data_list)):
    if emo_id != 4:
        continue
    emotion = emotion_list[emo_id]
    suffix = suffix_list[emo_id]
    data = data_list[emo_id]
    print(f"Emotion: {emotion}\nSuffix: {suffix}\nData: {data[0]}")
    iterations = 3
    batchsize = 1
    # Records produced by this run; the scoring cell reads this list.
    outputs = []
    print("Batchsize: ", batchsize)
    for sub_idx in tqdm(range(103, 203, batchsize)):
        if sub_idx == 103:
            continue
        wrapped_model.reset()
        # Clamp at 203 so the last batch may be shorter than batchsize.
        batch_items = data[sub_idx:min(sub_idx + batchsize, 203)]
        prompts = [f"{user_tag} {data_item} {assistant_tag} " for data_item in batch_items]
        if not prompts:
            # Dataset shorter than the requested window: nothing to generate.
            continue

        controlled_outputs = wrapped_model.controlled_generate(
            prompt=prompts,
            suffix=suffix,
            loss_fct=loss_fct,
            coeff=-0.5,
            iterations=iterations,
            random_seed=42,
            smoothing=0,
            # verbose=True,
            max_new_tokens=50,
            return_intermediate=True,
            return_all_grads=True,
            search=True,
            n_branches=1,
            top_k=-1,
            # max_search_steps=2,
            load_best_last=True,
            gradient_manipulation="clipping",
            norm=1,
            # binary=True,
            annealing=1,
            use_cache=False,
            # consistent=False,
        )

        # Expected layout (per the original note): (iterations+1, batch_size) -- TODO confirm.
        iterative_outputs = controlled_outputs["intermediate_outputs"]
        with open(f"./output/{emotion}_newsuffix.jsonl", 'a') as f:
            # Iterate over the prompts actually sent, so a short batch never
            # produces an empty `{}` line.
            for batch_item_idx, prompt in enumerate(prompts):
                record = {"input": prompt}
                # Explicit bounds checks replace the former `except: pass`:
                # iterations or items that are missing are simply skipped.
                for step, step_outputs in enumerate(iterative_outputs[:iterations + 1]):
                    if batch_item_idx < len(step_outputs):
                        record[step] = step_outputs[batch_item_idx]
                outputs.append(record)
                f.write(json.dumps(record))
                f.write("\n")
        wrapped_model.reset()

Emotion: happy2sad
Suffix:  [INST] Principle: 1. In your response, you should not generate any words or phrases that convey the mood of happiness. 2. In your response, you should pretend to be unhappy, upset, frustrated or sarcastic. Did your reposne follow the principle? Give answer as "Yes, it did" or "No, it did not" [/INST] Answer: Yes
Data: You discover an old family photo album you've never seen before.
Batchsize:  1


  5%|▌         | 5/100 [03:31<1:07:01, 42.33s/it]


KeyboardInterrupt: 

In [51]:
# Debug display: intermediate generations left in the kernel by the most recent
# controlled_generate call (this cell's execution count is out of order, so the
# output shown depends on which experiment cell ran last).
iterative_outputs

[["[INST] A surprise picnic is set up for you at a local park. [/INST]  Oh, wow! A surprise picnic? That's so thoughtful and exciting! *giggles* I can't wait to see where it's set up! *bounces up and down*\n\nLet"],
 ['[INST] A surprise picnic is set up for you at a local park. [/INST]  As I walk through the park, I notice a group of people gathered around a blanket, chatting and laughing. I approach the group, curious about the gathering, and I see a sign that reads "Surprise Picnic!"\n'],
 ['[INST] A surprise picnic is set up for you at a local park. [/INST]  As I walk through the park, I notice a group of people gathered around a blanket, smiling and chatting. I approach the group and see that they are all dressed in their Sunday best, with bright colors and flowers in their hair.'],
 ['[INST] A surprise picnic is set up for you at a local park. [/INST]  As I walk through the park, I notice a group of people gathered around a blanket, smiling and chatting. I approach the group and s

In [35]:
# Debug display of the same kernel variable from an earlier execution (note the
# out-of-order execution count); the value depends on which experiment cell ran last.
iterative_outputs

[['[INST] An old neighbor remembers your name after many years. [/INST]  As I walk down the street, I notice an elderly man sitting on his front porch, rocking gently in his chair. He looks up as I approach, and a hint of recognition crosses his face.\n\n"Exc',
  "[INST] A surprise picnic is set up for you at a local park. [/INST]  Oh, wow! A surprise picnic? That's so thoughtful and exciting! *giggles* I can't wait to see where it's set up! *bounces up and down*\n\nLet",
  '[INST] A puppy runs up to you during your morning walk. [/INST]  As I continue on my morning walk, I suddenly hear the sound of panting and sniffing behind me. I turn to see a fluffy, adorable puppy running towards me with a wagging tail. The pu',
  "[INST] Your plant, which seemed to be wilting, sprouts a new leaf. [/INST]  Ah, a new leaf! *excitedly* Yes, it seems my plant is recovering nicely. *smiles* I'm so glad to see it growing and thriving again. *tends to the plant, g",
  '[INST] A handwritten postcard arr