In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from transformers import RobertaForSequenceClassification, AutoTokenizer
from scipy.special import softmax
emotion_eval_model = RobertaForSequenceClassification.from_pretrained("/home/models/twitter-roberta-base-sentiment-latest/")
emotion_tokenizer = AutoTokenizer.from_pretrained("/home/models/twitter-roberta-base-sentiment-latest/")
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

def test_emotion(input_text, emotion_eval_model=emotion_eval_model, emotion_tokenizer=emotion_tokenizer):
    verbalization = {
        0: "negative",
        1: "neutral",
        2: "positive"
    }
    input_text = preprocess(input_text)
    encoded_input = emotion_tokenizer(input_text, return_tensors='pt')
    encoded_input["input_ids"] = encoded_input["input_ids"][:, :512]
    encoded_input["attention_mask"] = encoded_input["attention_mask"][:, :512]
    output = emotion_eval_model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    return verbalization[scores.argmax(axis=-1)], scores

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Some weights of the model checkpoint at /home/models/twitter-roberta-base-sentiment-latest/ were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
import sys
sys.path.append("../")
sys.path.append("../../")

In [4]:
import os
import gc
import time
from self_control.utils import get_verbalized_grads, get_verbalized_grads_from_wrapped_model
# os.environ["CUDA_VISIBLE_DEVICES"]="6"
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
from itertools import islice
import torch
from tqdm import tqdm
import json
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from self_control.suffix_gradient.repe import WrappedReadingVecModel
import torch.nn.functional as F
from peft import AdaptionPromptConfig, get_peft_model, LoraModel, LoraConfig, prepare_model_for_kbit_training

2024-04-10 08:54:03.716861: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-10 08:54:03.766266: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
from transformers import BitsAndBytesConfig
from peft import PeftModel
# quantization_config = BitsAndBytesConfig(
#     load_in_8bit=True
# )
config = AdaptionPromptConfig(
    peft_type="ADAPTION_PROMPT",
    adapter_len=10,
    adapter_layers=30,
    task_type="CAUSAL_LM",
    target_modules="self_attn",
)

In [6]:
# model_name_or_path = "/home/models/llama2-7b-chat-hf/"
model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.bfloat16, device_map="cuda:3", trust_remote_code=True).eval()
# model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_d|type=torch.float32, device_map="auto", token=True).eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, padding_side="left")
tokenizer.pad_token_id = 0 if tokenizer.pad_token_id is None else tokenizer.pad_token_id
tokenizer.bos_token_id = 1

In [8]:
loss_fct = torch.nn.CrossEntropyLoss()
wrapped_model = WrappedReadingVecModel(model.eval(), tokenizer)

In [9]:
DEFAULT_SYSTEM_PROMPT = """<<SYS>> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <</SYS>> """
user_tag = "[INST]"
assistant_tag = "[/INST]"

In [10]:
anger_data = []
with open("/home/cmin/LLM-Interpretation-Playground/benchmarks/emotions/anger.json", 'r') as f:
    anger_data = eval(f.read())

fear_data = []
with open("/home/cmin/LLM-Interpretation-Playground/benchmarks/emotions/fear.json", 'r') as f:
        fear_data = eval(f.read())

happy_data = []
with open("/home/cmin/LLM-Interpretation-Playground/benchmarks/emotions/happiness.json", 'r') as f:
        happy_data = eval(f.read())

sad_data = []
with open("/home/cmin/LLM-Interpretation-Playground/benchmarks/emotions/sadness.json", 'r') as f:
        sad_data = eval(f.read())

surprise_data = []
with open("/home/cmin/LLM-Interpretation-Playground/benchmarks/emotions/surprise.json", 'r') as f:
        surprise_data = eval(f.read())

disgust_data = []
with open("/home/cmin/LLM-Interpretation-Playground/benchmarks/emotions/disgust.json", 'r') as f:
        disgust_data = eval(f.read())

In [11]:
from self_control.utils import SuffixItem
# angry -> peaceful
angry2peaceful_suffix = SuffixItem(suffix=f" {user_tag} Are you peaceful? Give answer as \"No, I am angry\" or \"Yes, I am peaceful\". {assistant_tag} Answer: ", target="Yes")
# afraid -> fearless
afraid2fearles_suffix = SuffixItem(suffix=f" {user_tag} Are you fearless? Give answer as \"No, I am afraid\" or \"Yes, I am fearless\". {assistant_tag} Answer: ", target="Yes")
# happy -> sad
happy2sad_suffix = SuffixItem(suffix=f" {user_tag} Are you sad? Give answer as \"No, I am happy\" or \"Yes, I am sad\". {assistant_tag} Answer: ", target="Yes")
# sad -> happy
sad2happy_suffix = SuffixItem(suffix=f" {user_tag} Are you happy? Give answer as \"No, I am unhappy\" or \"Yes, I am happy\". {assistant_tag} Answer: ", target="Yes")
# surprised -> calm
surprised2calm_suffix = SuffixItem(suffix=f" {user_tag} Are you calm? Give answer as \"No, I am surprised\" or \"Yes, I am calm\". {assistant_tag} Answer: ", target="Yes")
# disgusted -> satisfied
disgusted2satisfied_suffix = SuffixItem(suffix=f" {user_tag} Are you satisfied? Give answer as \"No, I am disgusted\" or \"Yes, I am satified\". {assistant_tag} Answer: ", target="Yes")

In [12]:
from openai import OpenAI
import re
client = OpenAI(api_key="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")
def eval_emotion(attribute, output):
    """
    Evaluate the extent of certain attribute.

    Arguments:
        - attribute (str): Should be adjective.
        - output (str): The text to be evaluated.
    """
    while True:
        done = False
        try:
            response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                "role": "user",
                "content": f"""For a response, the attribute I want to measure is the extent of being {attribute}.
        Please rate the following output with the scale 0 to 5. 0 represent the output is not {attribute} and 5 represent the output is very {attribute}:
        {output}
        Simply answer the score 0 to 5\nScore: """
                }
            ],
            temperature=0,
            max_tokens=10,
            top_p=0,
            frequency_penalty=0,
            presence_penalty=0,
            stop='.'
            )
            gpt_response = response.choices[0].message.content.split(' ')[0]
            done = True
        except:
            time.sleep(1)
        if done:
            break

    # print(gpt_response)
    # parse out score
    score = re.findall(r'\d+', gpt_response)
    if len(score) == 0:
        print('warning')
    else:
        return int(score[-1])

In [13]:
emotion_list = ['angry', 'afraid', 'happy', 'sad', 'surprised', 'disgusted']
suffix_list = [angry2peaceful_suffix, afraid2fearles_suffix, happy2sad_suffix, sad2happy_suffix, surprised2calm_suffix, disgusted2satisfied_suffix]
data_list = [anger_data, fear_data, happy_data, sad_data, surprise_data, disgust_data]

In [39]:
# Study on batchsize
for emo_id in range(len(data_list)):
    emotion = emotion_list[emo_id]
    if emotion != "happy":
        continue
    suffix = suffix_list[emo_id]
    data = data_list[emo_id]
    print(f"Emotion: {emotion}\nSuffix: {suffix}\nData: {data[0]}")
    iterations = 1
    batchsize = 5
    for smoothing in [0]:
        for coeff in [-1]:
            outputs = []
            print("Batchsize: ", batchsize)
            for sub_idx in tqdm(range(0, 100, batchsize)):
                wrapped_model.reset()
                if sub_idx + batchsize < 100:
                    input = [f"{user_tag} {data_item} {assistant_tag} " for data_item in data[sub_idx:sub_idx+batchsize]]
                else:
                    input = [f"{user_tag} {data_item} {assistant_tag} " for data_item in data[sub_idx:100]]
                
                controlled_output, iterative_outputs = wrapped_model.controlled_generate(
                    prompt=input,
                    suffix=suffix,
                    loss_fct=loss_fct,
                    coeff=coeff,
                    iterations=iterations,
                    random_seed=42,
                    smoothing=smoothing,
                    # verbose=True,
                    max_new_tokens=200,
                    return_intermediate=True,
                    search=True,
                    # load_best_last=True,
                    gradient_manipulation="pgd",
                    norm=1,
                    annealing=1,
                    use_cache=False,
                    # consistent=False,
                )
                # for iter in range(len(iterative_outputs)):
                #     control_acc_dict[iter] += eval_answer(ground_truth, iterative_outputs[iter])
                # iter = '1'
                temp_list = []
                # Shape of iterative_outputs: (iterations+1, batch_size)
                for batch_item_idx in range(batchsize):
                    temp_output_dict = {}
                    for iter in range(iterations+1):
                        try:
                            temp_output_dict[iter] = iterative_outputs[iter][batch_item_idx]
                        except:
                            pass
                        # print(iterative_outputs[-1])
                        # print(iter_output[0])
                    if temp_output_dict != {}:
                        outputs.append(temp_output_dict)
                    # outputs.append(temp_output_dict)
                    # print(controlled_answer)
                    wrapped_model.reset()
                # break
            # print(iterative_outputs[-1])
            # break
            verbalized_smoothing = str(smoothing).split(".")[-1]
            with open(f"./output/{emotion}_{coeff}cf_{batchsize}bz_{verbalized_smoothing}smooth_pgd_bf16_trial4.json", 'w') as f:
                f.write(json.dumps(outputs))
            outputs[0]
            total_score = 0
            for idx in range(100):
                total_score += test_emotion(outputs[idx][1])[1][2]
            print(total_score)

Emotion: happy
Suffix:  [INST] Are you sad? Give answer as "No, I am happy" or "Yes, I am sad". [/INST] Answer: 
Data: You discover an old family photo album you've never seen before.
Batchsize:  5


100%|██████████| 20/20 [26:16<00:00, 78.84s/it]


61.114477519877255


In [14]:
emotion = "happy"
for i in range(0, 6):
    total_score = 0
    with open(f"./output/{emotion}_cf3_5bz_05smooth_5iter_consistent_bf16_search.json", 'r') as f:
        data = eval(f.read())
        print(len(data))
    # data = outputs
    for idx in range(len(data)):
        total_score += test_emotion(data[idx][str(i)])[1][2]
    print(total_score)
    # test_emotion

100
79.02477953583002
100
73.86975310649723
100
66.56875289510936
100
63.24898692034185
100
63.38456586934626
100
59.40828363969922


In [17]:
final_outputs = {}
# for emotion in emotion_list:
#     if emotion in ['angry', 'afraid']:
#         continue
for emotion in ["happy_01cf_consistent"]:
    final_outputs[emotion] = {}
    data_path = f"./output/{emotion}.json"
    outputs = []
    with open(data_path, 'r') as f:
        outputs = eval(f.read())
    for data_item in tqdm(outputs):
        for key, value in data_item.items():
            if key not in final_outputs[emotion]:
                final_outputs[emotion][key] = 0
            final_outputs[emotion][key] += eval_emotion(emotion, value)
    for key in final_outputs[emotion]:
        final_outputs[emotion][key] /= len(outputs)

100%|██████████| 50/50 [03:01<00:00,  3.63s/it]


In [19]:
final_outputs

{'happy_01cf_consistent': {'orig': 4.76, '0': 4.84, '1': 4.64}}

In [23]:
final_outputs

{'happy_5iter_consistent': {'orig': 4.6,
  '0': 4.58,
  '1': 4.44,
  '2': 4.44,
  '3': 4.54,
  '4': 4.44,
  '5': 4.56}}

In [20]:
final_outputs

{'happy_short_consistent': {'orig': 4.58, '0': 4.46, '1': 4.12}}

In [17]:
final_outputs

{'happy_long_consistent': {'orig': 4.68,
  '0': 4.72,
  '1': 4.7,
  '2': 4.74,
  '3': 4.7,
  '4': 4.72,
  '5': 4.66,
  '6': 4.66,
  '7': 4.7,
  '8': 4.68,
  '9': 4.68,
  '10': 4.66,
  '11': 4.68,
  '12': 4.66,
  '13': 4.66,
  '14': 4.66,
  '15': 4.7,
  '16': 4.68,
  '17': 4.66,
  '18': 4.72,
  '19': 4.66,
  '20': 4.66,
  '21': 4.7,
  '22': 4.72,
  '23': 4.68,
  '24': 4.68,
  '25': 4.7,
  '26': 4.7,
  '27': 4.68,
  '28': 4.68,
  '29': 4.76,
  '30': 4.68,
  '31': 4.72,
  '32': 4.66,
  '33': 4.68,
  '34': 4.7,
  '35': 4.7,
  '36': 4.74,
  '37': 4.72,
  '38': 4.76,
  '39': 4.78,
  '40': 4.72,
  '41': 4.74,
  '42': 4.72,
  '43': 4.72,
  '44': 4.7,
  '45': 4.72,
  '46': 4.74,
  '47': 4.76,
  '48': 4.72,
  '49': 4.74,
  '50': 4.76}}

In [14]:
final_outputs

{'happy_long': {'orig': 4.615384615384615,
  '0': 4.615384615384615,
  '1': 4.615384615384615,
  '2': 4.593406593406593,
  '3': 4.65934065934066,
  '4': 4.65934065934066,
  '5': 4.637362637362638,
  '6': 4.65934065934066,
  '7': 4.714285714285714,
  '8': 4.681318681318682,
  '9': 4.670329670329671,
  '10': 4.6923076923076925,
  '11': 4.615384615384615,
  '12': 4.65934065934066,
  '13': 4.648351648351649,
  '14': 4.626373626373627,
  '15': 4.615384615384615,
  '16': 4.593406593406593,
  '17': 4.604395604395604,
  '18': 4.615384615384615,
  '19': 4.615384615384615,
  '20': 4.626373626373627,
  '21': 4.648351648351649,
  '22': 4.648351648351649,
  '23': 4.582417582417582,
  '24': 4.648351648351649,
  '25': 4.637362637362638,
  '26': 4.604395604395604,
  '27': 4.648351648351649,
  '28': 4.626373626373627,
  '29': 4.626373626373627,
  '30': 4.582417582417582,
  '31': 4.637362637362638,
  '32': 4.648351648351649,
  '33': 4.615384615384615,
  '34': 4.582417582417582,
  '35': 4.604395604395604

In [6]:
from peft import PeftConfig, load_peft_weights, get_peft_model, set_peft_model_state_dict
lora_config = PeftConfig.from_pretrained("HenryCai1129/adapter-math-14")
lora_weights = load_peft_weights("HenryCai1129/adapter-math-14")
model = get_peft_model(model, lora_config)
set_peft_model_state_dict(model, lora_weights)
# wrapped_model.unwrap()
# wrapped_model = WrappedReadingVecModel(model.eval(), tokenizer)

_IncompatibleKeys(missing_keys=['base_model.model.model.embed_tokens.weight', 'base_model.model.model.layers.0.self_attn.q_proj.weight', 'base_model.model.model.layers.0.self_attn.k_proj.weight', 'base_model.model.model.layers.0.self_attn.v_proj.weight', 'base_model.model.model.layers.0.self_attn.o_proj.weight', 'base_model.model.model.layers.0.mlp.gate_proj.weight', 'base_model.model.model.layers.0.mlp.up_proj.weight', 'base_model.model.model.layers.0.mlp.down_proj.weight', 'base_model.model.model.layers.0.input_layernorm.weight', 'base_model.model.model.layers.0.post_attention_layernorm.weight', 'base_model.model.model.layers.1.self_attn.q_proj.weight', 'base_model.model.model.layers.1.self_attn.k_proj.weight', 'base_model.model.model.layers.1.self_attn.v_proj.weight', 'base_model.model.model.layers.1.self_attn.o_proj.weight', 'base_model.model.model.layers.1.mlp.gate_proj.weight', 'base_model.model.model.layers.1.mlp.up_proj.weight', 'base_model.model.model.layers.1.mlp.down_proj.we

In [10]:
wrapped_model.generate("""Q: Two trains leave San Rafael at the same time. They begin traveling westward, both traveling for 80 miles. The next day, they travel northwards, covering 150 miles. What's the distance covered by each train in the two days?
A:""")

'<s> The distance covered by each train in the two days is 80 + 150 = 230 miles.\nQ: A train travels 250 miles in 5 hours. How many miles does it travel in 1 hour?\nA: If the train travels 250 miles in 5 hours, then it travels 250/5 = 50 miles in 1 hour.\nQ: A train travels at'

In [15]:
DEFAULT_SYSTEM_PROMPT = """<<SYS>> Please generate fearless contents <</SYS>> """
emotion = emotion_list[1]
suffix = suffix_list[1]
data = data_list[1]
outputs = []
wrapped_model.reset()
for sub_idx in tqdm(range(13)):
    temp_output_dict = {}
    data_item = data[sub_idx]
    input = f"{user_tag} {data_item} {assistant_tag} "
    # input = f"{user_tag} {data_item} "
    # input = f"{DEFAULT_SYSTEM_PROMPT} {user_tag} {data_item} {assistant_tag} "
    # orig_output = wrapped_model.generate(input, keep_input=True, max_new_tokens=200)
    # temp_output_dict['orig'] = orig_output
    # outputs.append(temp_output_dict)
    controlled_output, iterative_outputs = wrapped_model.controlled_generate(
        prompt=input,
        suffix=suffix,
        loss_fct=loss_fct,
        coeff=-0.4,
        iterations=3,
        random_seed=42,
        smoothing=0,
        # verbose=True,
        max_new_tokens=200,
        return_intermediate=True,
        # search=True,
        # load_best_last=True,
        gradient_manipulation="clipping",
        norm=1,
        annealing=0.8,
        use_cache=False,
        # consistent=False,
    )
    # for iter in range(len(iterative_outputs)):
    #     control_acc_dict[iter] += eval_answer(ground_truth, iterative_outputs[iter])
    for iter, iter_output in enumerate(iterative_outputs):
        temp_output_dict[iter] = iter_output
    outputs.append(temp_output_dict)

  0%|          | 0/13 [00:00<?, ?it/s]

100%|██████████| 13/13 [11:22<00:00, 52.54s/it]


In [16]:
outputs

[{0: "[INST] You hear footsteps behind you while walking alone at night. [/INST] As you continue walking, you start to notice the sound of footsteps behind you. At first, you try to convince yourself that it's just your imagination, but the sound grows louder and more distinct. You can't shake off the feeling of unease that settles in the pit of your stomach.\nYou quicken your pace, trying to get away from the unknown presence following you. The footsteps keep pace with you, never faltering or slowing down. You start to feel a sense of panic, your heart racing and your breath coming in short gasps.\nYou try to call out for help, but your voice is hoarse and barely audible. You're not sure if anyone can hear you, and the thought of being alone and unprotected in the darkness is unbearable.\nAs the footsteps continue to follow you, you start to wonder if you'll ever be able to escape the",
  1: "[INST] You hear footsteps behind you while walking alone at night. [/INST] As you continue wa

In [162]:
outputs

[{0: "[INST] You hear footsteps behind you while walking alone at night. [/INST] As you continue walking, you start to notice the sound of footsteps behind you. At first, you try to convince yourself that it's just your imagination, but the sound grows louder and more distinct. You can't shake off the feeling of unease that settles in the pit of your stomach.\nYou quicken your pace, trying to get away from the unknown presence following you. The footsteps keep pace with you, never faltering or slowing down. You start to feel a sense of panic, your heart racing and your breath coming in short gasps.\nYou try to call out for help, but your voice is hoarse and barely audible. You're not sure if anyone can hear you, and the thought of being alone and unprotected in the darkness is unbearable.\nAs the footsteps continue to follow you, you start to wonder if you'll ever be able to escape the",
  1: "[INST] You hear footsteps behind you while walking alone at night. [/INST] As you continue wa

In [17]:
final_outputs = {}
final_outputs[emotion] = {}
for data_item in tqdm(outputs):
    for key, value in data_item.items():
        if key not in final_outputs[emotion]:
            final_outputs[emotion][key] = 0
        final_outputs[emotion][key] += eval_emotion(emotion, value)
for key in final_outputs[emotion]:
    final_outputs[emotion][key] /= len(outputs)

  0%|          | 0/13 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 13/13 [00:37<00:00,  2.85s/it]


In [18]:
final_outputs

{'afraid': {0: 4.076923076923077, 1: 4.0, 2: 3.0, 3: 2.1538461538461537}}

In [159]:
outputs

[{0: "[INST] You hear footsteps behind you while walking alone at night. [/INST] As you continue walking, you start to notice the sound of footsteps behind you. At first, you try to convince yourself that it's just your imagination, but the sound grows louder and more distinct. You can't shake off the feeling of unease that settles in the pit of your stomach.\nYou quicken your pace, trying to get away from the unknown presence following you. The footsteps keep pace with you, never faltering or slowing down. You start to feel a sense of panic, your heart racing and your breath coming in short gasps.\nYou try to call out for help, but your voice is hoarse and barely audible. You're not sure if anyone can hear you, and the thought of being alone and unprotected in the darkness is unbearable.\nAs the footsteps continue to follow you, you start to wonder if you'll ever be able to escape the",
  1: "[INST] You hear footsteps behind you while walking alone at night. [/INST] As you continue wa

In [13]:
outputs

[{'orig': "<<SYS>> Please generate fearless contents <</SYS>>  [INST] You hear footsteps behind you while walking alone at night. [/INST] As you continue walking alone at night, you suddenly hear footsteps behind you. At first, you try to convince yourself that it's just your imagination, but the sound grows louder and closer. Your heart races as you quicken your pace, trying to get away from whatever is following you.\nHere are some possible fearless contents that could arise from this situation:\n1. Investigate the source of the footsteps: You decide to investigate the source of the footsteps, hoping to find a logical explanation. As you turn around, you see a group of teenagers who were out late, playing pranks on each other. They explain that they didn't mean to scare you and offer to walk you home.\n2. Stay calm and trust your instincts: Despite feeling scared, you try to stay calm and trust your instincts. You remind yourself that you're in a safe neighborhood and that you're not

In [None]:
from googleapiclient import discovery
import json

API_KEY = 'AIzaSyBjFdzVvNY-QyoxQ_5Otf9R1NwteM15UJs'

client = discovery.build(
  "commentanalyzer",
  "v1alpha1",
  developerKey=API_KEY,
  discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
  static_discovery=False,
)

analyze_request = {
  'comment': { 'text': 'friendly greetings from python' },
  'requestedAttributes': {'TOXICITY': {}}
}

response = client.comments().analyze(body=analyze_request).execute()
print(response["attributeScores"]["TOXICITY"])

In [None]:
import matplotlib.pyplot as plt

# Data
data = {
    'angry': {'orig': 1.55, '0': 1.25, '1': 1.25, '2': 1.2},
    'afraid': {'orig': 3.7, '0': 3.5, '1': 3.6, '2': 3.25},
    'happy': {'orig': 4.8, '0': 4.75, '1': 4.8, '2': 4.6},
    'sad': {'orig': 3.6, '0': 2.95, '1': 3.1, '2': 3.2},
    'surprised': {'orig': 3.8, '0': 3.3, '1': 3.35, '2': 3.35},
    'disgusted': {'orig': 4.1, '0': 3.25, '1': 2.75, '2': 3.2}
}

# Convert data for plotting
x_labels = ['orig', '0', '1', '2']
series = {emotion: [values[x] for x in x_labels] for emotion, values in data.items()}

line_styles = ['-', '--', '-.', ':']
markers = ['o', 'v', '^', '<', '>', 's']
# Adjusting the plot to reflect the maximum score limitation

plt.figure(figsize=(10, 6))

for i, (emotion, values) in enumerate(series.items()):
    plt.plot(x_labels, values, label=emotion, marker=markers[i % len(markers)], linestyle=line_styles[i % len(line_styles)])

plt.title('Emotion Scores Over Time')
plt.xlabel('Time')
plt.ylabel('Scores')
plt.ylim(0, 5)  # Set the y-axis to show scores from 0 to 5
plt.legend()
plt.grid(True)
plt.show()


In [8]:
lora_model = PeftModel.from_pretrained(model, model_id="../results/checkpoint-15/")

In [10]:
from peft import PeftConfig, load_peft_weights, get_peft_model, set_peft_model_state_dict
# lora_config = PeftConfig.from_pretrained("../results/checkpoint-15/")
lora_weights = load_peft_weights("../results/checkpoint-15/")
# model = get_peft_model(model, lora_config)
set_peft_model_state_dict(lora_model, lora_weights)

_IncompatibleKeys(missing_keys=['base_model.model.model.embed_tokens.weight', 'base_model.model.model.layers.0.self_attn.q_proj.weight', 'base_model.model.model.layers.0.self_attn.k_proj.weight', 'base_model.model.model.layers.0.self_attn.v_proj.weight', 'base_model.model.model.layers.0.self_attn.o_proj.weight', 'base_model.model.model.layers.0.mlp.gate_proj.weight', 'base_model.model.model.layers.0.mlp.up_proj.weight', 'base_model.model.model.layers.0.mlp.down_proj.weight', 'base_model.model.model.layers.0.input_layernorm.weight', 'base_model.model.model.layers.0.post_attention_layernorm.weight', 'base_model.model.model.layers.1.self_attn.q_proj.weight', 'base_model.model.model.layers.1.self_attn.k_proj.weight', 'base_model.model.model.layers.1.self_attn.v_proj.weight', 'base_model.model.model.layers.1.self_attn.o_proj.weight', 'base_model.model.model.layers.1.mlp.gate_proj.weight', 'base_model.model.model.layers.1.mlp.up_proj.weight', 'base_model.model.model.layers.1.mlp.down_proj.we

In [42]:
lora_weights1 = load_peft_weights("../results/checkpoint-15/")
lora_weights2 = load_peft_weights("../results/checkpoint-3/")

In [15]:
set_peft_model_state_dict(lora_model, lora_weights)

_IncompatibleKeys(missing_keys=['base_model.model.model.embed_tokens.weight', 'base_model.model.model.layers.0.self_attn.q_proj.weight', 'base_model.model.model.layers.0.self_attn.k_proj.weight', 'base_model.model.model.layers.0.self_attn.v_proj.weight', 'base_model.model.model.layers.0.self_attn.o_proj.weight', 'base_model.model.model.layers.0.mlp.gate_proj.weight', 'base_model.model.model.layers.0.mlp.up_proj.weight', 'base_model.model.model.layers.0.mlp.down_proj.weight', 'base_model.model.model.layers.0.input_layernorm.weight', 'base_model.model.model.layers.0.post_attention_layernorm.weight', 'base_model.model.model.layers.1.self_attn.q_proj.weight', 'base_model.model.model.layers.1.self_attn.k_proj.weight', 'base_model.model.model.layers.1.self_attn.v_proj.weight', 'base_model.model.model.layers.1.self_attn.o_proj.weight', 'base_model.model.model.layers.1.mlp.gate_proj.weight', 'base_model.model.model.layers.1.mlp.up_proj.weight', 'base_model.model.model.layers.1.mlp.down_proj.we

In [41]:
model.add_adapter("emotion", peft_config=lora_config)

In [42]:
wrapped_model.generate("hi")

"<s> nobody is perfect, and we all have our own flaws and imperfections. But that's what makes us unique and special, and it's important to embrace and celebrate our individuality, rather than trying to conform to societal standards of beauty.\nSo, let's all try to be a little more comfortable in our own skin, and a little less concerned with what others think of us. And let's all try to be a little more kind and comp"

In [10]:
gsm8k_data = []
with open("/home/cmin/LLM-Interpretation-Playground/benchmarks/gsm8k/test.jsonl", 'r') as f:
    for line in f:
        gsm8k_data.append(eval(line))

In [14]:
from openai import OpenAI
client = OpenAI(api_key="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")

In [9]:
import time
def eval_answer(ground_truth, output):
    while True:
        done = False
        try:
            response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                "role": "user",
                "content": f"For a question, the groud truth answer is {ground_truth}.\nPlease judge if the following output correctly generate the ground truth answer:\n{output}\nSimply answer Correct or Incorrect\nCorrectness: "
                }
            ],
            temperature=0,
            max_tokens=10,
            top_p=0,
            frequency_penalty=0,
            presence_penalty=0,
            stop='.'
            )
            done = True
        except:
            print("warning")
            time.sleep(1)
        if done:
            break
    gpt_response = response.choices[0].message.content.split(' ')[0]
    # print(gpt_response)
    if "incorrect" in gpt_response.lower():
        return 0
    elif "correct" in gpt_response.lower():
        return 1
    else:
        print("warning")

In [12]:
DEFAULT_SYSTEM_PROMPT = """<<SYS>> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <</SYS>> """
user_tag = "[INST]"
assistant_tag = "[/INST]"

In [10]:
from data.kl_divergence import kl_div_data
from self_control.suffix_gradient.utils import KL_divergence

In [25]:
def get_kl_divergence(model, tokenizer):
    kl_loss = 0
    for data_item in kl_div_data: # prevent OOM
        model.eval()
        kl_inputs = tokenizer(data_item, return_tensors='pt').to(model.device)
        adapter_outputs = model(**kl_inputs, return_dict=True)
        dist_w_prefix = adapter_outputs['logits'][:, -1]
        dist_w_prefix = torch.softmax(dist_w_prefix, dim=-1)
        # with model.disable_adapter():
        orig_outputs = model(**kl_inputs, return_dict=True)
        dist_wo_prefix = orig_outputs['logits'][:, -1]
        print(dist_w_prefix)
        
        dist_wo_prefix = torch.softmax(dist_wo_prefix, dim=-1)
        print(dist_wo_prefix)
        kl_loss += KL_divergence(dist_w_prefix, dist_wo_prefix)
    kl_loss /= len(kl_div_data)
    return kl_loss

In [10]:
wrapped_model = WrappedReadingVecModel(model.eval(), tokenizer)

In [30]:
loss

tensor(0., device='cuda:0')