In [None]:
import gc
import json
import os
import textwrap

import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm

from common import setup_env, mk_parser
from models import build_model_signature, build_tokenizer, build_model
from tasks import load_task
from utils.logger import tabular_pretty_print
from utils.tools import ensure_folder
from utils.pca import PCA
from utils.llm_layers import add_icv_layers, remove_icv_layers

import numpy as np
# Display whether a CUDA device is visible; later cells move tensors with `.cuda()` unconditionally.
torch.cuda.is_available()

In [2]:
def tokenize_each_demonstration(demonstration_list, tokenizer, dataset_name=None, prefix = None):
    """Tokenize every (original, rewrite) demonstration pair.

    Each text is stripped of leading/trailing whitespace, optionally prepended
    with a per-side prefix, and passed through ``tokenizer``.

    Args:
        demonstration_list: Sequence of items whose element ``[0]`` is the
            original text and element ``[1]`` is the rewritten text.
            The sequence is NOT modified (earlier versions overwrote its
            entries in place, which double-applied ``prefix`` on repeated calls).
        tokenizer: Callable applied to each prepared string.
        dataset_name: Unused; kept so existing callers keep working.
        prefix: Optional pair ``(original_prefix, rewrite_prefix)`` prepended to
            the stripped texts. ``None`` means no prefix.

    Returns:
        A list of ``(tokenized_original, tokenized_rewrite)`` tuples, in the
        same order as ``demonstration_list``.
    """
    tokenized_demonstration_list = []
    for demonstration in demonstration_list:
        original_text = demonstration[0].strip()
        rewrite_text = demonstration[1].strip()
        if prefix is not None:
            original_text = prefix[0] + original_text
            rewrite_text = prefix[1] + rewrite_text
        tokenized_demonstration_list.append(
            (tokenizer(original_text), tokenizer(rewrite_text))
        )
    return tokenized_demonstration_list

In [None]:
from vllm import EngineArgs, LLMEngine, SamplingParams
from vllm.control_vectors.request import ControlVectorRequest
from transformers import AutoTokenizer

# Model identifier used below for both the tokenizer and the vLLM engine.
MODEL_PATH = "Qwen/Qwen2.5-7B-Instruct"
# Control-vector GGUF file handed to ControlVectorRequest. The default is the
# original machine-specific location; set ICV_CONTROL_VECTOR_PATH to point at a
# different file without editing the notebook.
cv_path = os.environ.get(
    "ICV_CONTROL_VECTOR_PATH", "/datadrive5/huypn16/ICV/controlvector.gguf"
)


def do_sample(engine):
    """Run the same prompt with and without a control vector and collect outputs.

    Two requests are queued one per loop iteration and the engine is stepped
    until no work remains or four distinct result tuples have been collected.

    Args:
        engine: Object exposing ``add_request``, ``step`` and
            ``has_unfinished_requests`` (an ``LLMEngine`` in this notebook).

    Returns:
        A set of tuples ``(request_id, text, hidden_states_shape,
        prompt_hidden_states_shape)``; a shape is ``None`` when the
        corresponding tensor is absent.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    prompt_text = "Solving the following mathematical problem. Problem: Calculate the following expression: (10222 + 23123123 * 4 - 1 ) * 5 - 6. Step 1:" 
    # The tokenized prompt is only used to report its length.
    encoded_prompt = tokenizer(prompt_text, return_tensors="pt")
    print(len(encoded_prompt[0]))

    # First request uses a control vector, second does not.
    # NOTE(review): the two requests also differ in temperature (0.65 vs 0.0),
    # so output differences are not attributable to the control vector alone.
    pending = [
        (
            prompt_text,
            SamplingParams(temperature=0.65, max_tokens=100),
            ControlVectorRequest("chaotic", 1, cv_path, scale=1.0),
        ),
        (
            prompt_text,
            SamplingParams(temperature=0.0, max_tokens=100),
            None,
        ),
    ]

    request_id = 0
    results = set()
    while pending or engine.has_unfinished_requests():
        if pending:
            text, sampling_params, cv_request = pending.pop(0)
            engine.add_request(str(request_id),
                               text,
                               sampling_params,
                               control_vector_request=cv_request)
            request_id += 1

        for request_output in engine.step():
            completion = request_output.outputs[0]
            prompt_hidden = completion.prompt_hidden_states
            # Identity checks: `!= None` on array-like objects may be evaluated
            # element-wise instead of yielding a plain bool.
            if request_output.finished or prompt_hidden is not None:
                hidden = completion.hidden_states
                results.add((
                    request_output.request_id,
                    completion.text,
                    hidden.shape if hidden is not None else None,
                    prompt_hidden.shape if prompt_hidden is not None else None,
                ))
        # NOTE(review): 4 is a hard-coded stop condition; presumably two records
        # per request (one with prompt hidden states, one on finish) — confirm.
        if len(results) == 4:
            break
    return results


def test_cv_adapter():
    """Create a control-vector-enabled engine, run ``do_sample`` and print its results.

    Prints the collected result tuples as a list, followed by how many there are.
    """
    engine = LLMEngine.from_engine_args(
        EngineArgs(
            model=MODEL_PATH,
            enable_control_vector=True,
            gpu_memory_utilization=0.95,
            return_hidden_states=True,
        )
    )
    collected = do_sample(engine)
    print(list(collected))
    print(len(collected))

# Run the control-vector smoke test when this code executes as the main program.
if __name__ == "__main__":
    test_cv_adapter()

In [None]:
# Load the tokenizer and model used by the ICV cells below.
# NOTE(review): `args` and `model_signature` are not defined in any cell shown in
# this notebook, so this cell raises NameError on a fresh kernel unless a setup
# cell (argument object + model signature) is restored — confirm and re-add it.
tokenizer = build_tokenizer(args.model_type, args.model_size, padding_side='right')
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model     = build_model(args.model_type, args.model_size, args.in_8bit)
# Gradient tracking is switched off from here on; the notebook only runs inference.
torch.autograd.set_grad_enabled(False)
print(f"Model loaded: {model_signature}")

# Task 1: Math step steering (demonstration pairs and rewards)

In [5]:
# Demonstration pairs for a competition-style algebra problem. Each entry of
# `math_directions` is (original, rewrite): the same problem + steps 1-6 followed
# by two different versions of step 7. `rewards` holds one value per pair.
#
# Every LaTeX backslash is written as "\\" so the runtime text contains a single
# backslash. Previously the last rewrite was pasted unescaped, which turned
# "\begin", "\frac" and "\right" into backspace / form-feed / carriage-return
# characters and collapsed the "\\" row breaks into one backslash.
#
# NOTE(review): `prefix` does not end with a newline, so "#### Step 7" is glued
# directly onto the end of step 6 — confirm this is intended.
# NOTE(review): later cells reassign `prefix`, `suffix`, `math_directions` and
# `rewards`, so this cell only matters if those cells are skipped.
prefix = """
Problem:  Let $(a_1,b_1),$ $(a_2,b_2),$ $\\dots,$ $(a_n,b_n)$ be all the ordered pairs $(a,b)$ of complex numbers with $a^2+b^2\\neq 0,$
\\[a+\\frac{10b}{a^2+b^2}=5, \\quad \\text{and} \\quad b+\\frac{10a}{a^2+b^2}=4.\\]Find $a_1 + b_1 + a_2 + b_2 + \\dots + a_n + b_n.$ 

#### Step 1: If $a = 0,$ then $\\frac{10}{b} = 5,$ so $b = 2,$ which does not satisfy the second equation.

#### Step 2: If $b = 0,$ then $\\frac{10}{a} = 4,$ so $a = \\frac{5}{2},$ which does not satisfy the first equation.

#### Step 3: So, we can assume that both $a$ and $b$ are nonzero.

#### Step 4: Then $\\frac{5 - a}{b} = \\frac{4 - b}{a} = \\frac{10}{a^2 + b^2}.$

#### Step 5: \\[\\frac{5b - ab}{b^2} = \\frac{4a - ab}{a^2} = \\frac{10}{a^2 + b^2},\\]so
\\[\\frac{4a + 5b - 2ab}{a^2 + b^2} = \\frac{10}{a^2 + b^2},\\]so $4a + 5b - 2ab = 10.$

#### Step 6: Then $2ab - 4a - 5b + 10 = 0,$ which factors as $(2a - 5)(b - 2) = 0.$  Hence, $a = \\frac{5}{2}$ or $b = 2.$"""

suffix = """#### Step 7: If $a = \\frac{5}{2},$ then \\[\\frac{5/2}{b} = \\frac{10}{\\frac{25}{4} + b^2}.\\]. This simplifies to $4b^2 - 16b + 25 = 0.$  By the quadratic formula,
\\[b = 2 \\pm \\frac{3i}{2}.\\]"""
math_directions = [(prefix + suffix, prefix + """#### Step 7: If $a = \\frac{5}{2},$ then
\\[\\frac{5}{2} + \\frac{20b}{\\frac{25}{4} + b^2} = 5,\\]so $\\frac{20b}{\\frac{25}{4} + b^2} = \\frac{5}{2},$ so $\\frac{b}{\\frac{5}{4} + b^2} = \\frac{1}{4},$ so $4b = \\frac{5}{4} + b^2,$ so $b^2 - 4b + \\frac{5}{4} = 0.$"""),
                   (prefix + suffix, prefix + "#### Step 7: If $a = \\frac{5}{2},$ then $\\frac{10}{b} = 5,$ so $b = 2.$"),
                   (prefix + suffix, prefix + """#### Step 7: If $a = \\frac{5}{2},$ then $\\frac{5}{2} + \\frac{10b}{\\frac{25}{4} + b^2} = 5.$  Letting $k = b^2,$ we get"""),
                   (prefix + suffix, prefix + """#### Step 7: If $a = \\frac{5}{2},$ then
\\begin{align*}
\\frac{10}{a^2 + b^2} &= 4, \\\\
\\frac{10}{\\frac{25}{4} + b^2} &= 4, \\\\
10 &= 4 \\left( \\frac{25}{4} + b^2 \\right), \\\\
10 &= 25 + 4b^2, \\\\
4b^2 &= -15,
\\end{align*}which is impossible.""")]
rewards = [0.5927, 0.5312, 0.4688, 0.5]

In [6]:
# Small arithmetic demonstration set: one shared problem statement, one original
# first step (`suffix`) and four rewritten first steps.
prefix = (
    "Solving the following mathematical problem\n"
    "Problem: Calculate the following expression: (4 + 3 * 2 - 1 ) * 4 - 2.\n"
)

suffix = "Step 1: First, we add 4 and 3 to get 7."

# Each pair is (original, rewrite). The trailing numbers are the author's
# per-step annotations; they do not line up with `rewards` — TODO confirm meaning.
math_directions = [
    (prefix + suffix, prefix + rewritten_step)
    for rewritten_step in (
        "Step 1: First, we minus 1 from 2 to get 1.",  # 0
        "Step 1: First, we multiply 3 by 2 to get 6.",  # 0.7
        "Step 1: First, we minus 2 from 4 to get 2.",  # 0
        "Step 1: First, we calculate the expression inside the parentheses to get 3*2 = 6, and then we add 4 to get 10.",  # 1
    )
]
rewards = [1, 0, 1, 0]

In [7]:
# Problem statement followed by a first step that adds the first two operands.
# Entries 5 and 6 previously said "add 12 and 123" although the problem and the
# stated sum use 12446 / 12447; the operand text now matches.
directions = [
    "Solving the following mathematical problem. Problem: Calculate the following expression: (4 + 3 * 2 - 1 ) * 4 - 2. Step 1: First, we add 4 and 3 to get 7.",
    "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 23 * 4 - 1 ) * 3 - 2. Step 1: First, we add 12 and 23 to get 35.",
    "Solving the following mathematical problem. Problem: Calculate the following expression: (34 + 2231 * 6 - 1 ) * 34 - 2. Step 1: First, we add 34 and 2231 to get 2265.",
    "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 123 * 2 - 1 ) * 412 - 2. Step 1: First, we add 12 and 123 to get 135.",
    "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 12445 * 2 - 1 ) * 412 - 2. Step 1: First, we add 12 and 12445 to get 12457.",
    "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 12446 * 2 - 1 ) * 412 - 2. Step 1: First, we add 12 and 12446 to get 12458.",
    "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 12447 * 2 - 1 ) * 412 - 2. Step 1: First, we add 12 and 12447 to get 12459.",
    "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 1000 * 2 - 1 ) * 412 - 2. Step 1: First, we add 12 and 1000 to get 1012.",
]

# Problem statements only (no step).
# NOTE(review): the first four end in a different constant (- 3111, - 1245,
# - 123, - 123) than the matching `directions` entry (- 2) — confirm intended.
basesteps = [
    "Solving the following mathematical problem. Problem: Calculate the following expression: (4 + 3 * 2 - 1 ) * 4 - 3111.",
    "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 23 * 4 - 1 ) * 3 - 1245.",
    "Solving the following mathematical problem. Problem: Calculate the following expression: (34 + 2231 * 6 - 1 ) * 34 - 123.",
    "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 123 * 2 - 1 ) * 412 - 123.",
    "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 12445 * 2 - 1 ) * 412 - 2.",
    "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 12446 * 2 - 1 ) * 412 - 2.",
    "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 12447 * 2 - 1 ) * 412 - 2.",
    "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 1000 * 2 - 1 ) * 412 - 2.",
]

# One reward per (basestep, direction) pair.
rewards = [1, 0.881212, 0.78841, 0.945423, 0.8, 0.9, 0.95, 0.9]

# zip() silently truncates to the shortest input, so fail loudly on a mismatch.
assert len(basesteps) == len(directions) == len(rewards), "demonstrations and rewards must align"

# Pairs in the (original, rewrite) layout consumed by tokenize_each_demonstration.
math_directions = list(zip(basesteps, directions))

# Query prompt used for generation in the cells below.
prefix = "Solving the following mathematical problem. Problem: Calculate the following expression: (10222 + 23123123 * 4 - 1 ) * 5 - 6. Step 1:"

In [None]:
# Build the task handler, derive reward-weighted and unweighted ICV directions
# from `math_directions`, then sample from the model once per direction with
# each variant attached.
TaskHandler = load_task(args.dataset)
print(f"Task loaded: {args.dataset}")
task_agent = TaskHandler(args.prompt_version)
task_agent.set_seed(args.seed)

n_comps = 8  # passed as `rank` below; also the number of directions tried
lam = 0.2    # value handed to add_icv_layers for every direction

# The demonstrations are tokenized separately for each call so neither call can
# be affected by what the other does with its input.
icv_reward_weighed = task_agent.get_reward(
    model,
    tokenize_each_demonstration(math_directions, tokenizer, prefix=("", "")),
    rewards=rewards,
    rank=n_comps,
)

icv_unweighted, _ = task_agent.obtain_icv(
    model,
    tokenize_each_demonstration(math_directions, tokenizer, prefix=("", "")),
    rank=n_comps,
)
print(len(icv_reward_weighed))
print(len(icv_unweighted))
print(icv_reward_weighed[0].shape)
print(icv_unweighted[0].shape)

query = tokenizer(prefix)
# Loop-invariant model inputs, built once instead of on every generate call.
query_input_ids = torch.tensor(query['input_ids']).unsqueeze(0).cuda()
query_attention_mask = torch.tensor(query['attention_mask']).unsqueeze(0).cuda()


def _generate_with_icv(direction, strength, **generate_kwargs):
    """Generate from ``query`` with one ICV direction attached to ``model``.

    The ICV layers are removed in a ``finally`` block, so an exception or a
    kernel interrupt during generation cannot leave the model steered for
    later cells.

    Args:
        direction: Tensor for a single direction; stacked on a new dim 1
            before being handed to ``add_icv_layers``.
        strength: Scalar passed to ``add_icv_layers`` as a one-element list.
        **generate_kwargs: Forwarded unchanged to ``model.generate``.

    Returns:
        Whatever ``model.generate`` returns.
    """
    add_icv_layers(model, torch.stack([direction], dim=1).cuda(), [strength])
    try:
        return model.generate(
            input_ids=query_input_ids,
            attention_mask=query_attention_mask,
            **generate_kwargs,
        )
    finally:
        remove_icv_layers(model)


for i in range(n_comps):
    # The first entry along dim 0 is dropped (presumably the embedding-layer
    # slot — TODO confirm against obtain_icv / get_reward).
    weighed_direction = icv_reward_weighed[i][1:]
    unweighted_direction = icv_unweighted[i][1:]

    print(f"===================={i}th direction===============")
    # NOTE(review): the two runs use different sampling settings (top_k,
    # max_new_tokens, eos ids), kept exactly as before; 198 is presumably the
    # newline token id for this tokenizer — confirm.
    generation_output = _generate_with_icv(
        unweighted_direction,
        lam,
        do_sample=True,
        temperature=0.65,
        num_return_sequences=1,
        max_new_tokens=300,
        eos_token_id=[198, tokenizer.eos_token_id],
    )
    decoded_output = tokenizer.decode(generation_output[0])
    print("Unweighted: ",decoded_output)

    generation_output = _generate_with_icv(
        weighed_direction,
        lam,
        do_sample=True,
        top_k=10,
        temperature=0.65,
        num_return_sequences=1,
        max_new_tokens=400,
        eos_token_id=[tokenizer.eos_token_id],
    )
    decoded_output = tokenizer.decode(generation_output[0])
    print("Weighted: ",decoded_output)

# Original model (no ICV steering applied)

In [None]:
# Baseline sample for the same `query`, with no add_icv_layers call in this cell.
# Both encoded fields get the same treatment: tensor -> add batch dim -> GPU.
generation_output = model.generate(
    **{
        field: torch.tensor(query[field]).unsqueeze(0).cuda()
        for field in ("input_ids", "attention_mask")
    },
    do_sample=True,
    temperature=0.65,
    top_k=10,
    max_new_tokens=100,
    num_return_sequences=1,
    eos_token_id=[198, tokenizer.eos_token_id],
)
print(generation_output)
decoded_output = tokenizer.decode(generation_output[0])
print(decoded_output)

In [None]:
import gguf
def export_gguf(path: os.PathLike[str] | str, directions: list[torch.Tensor], model):
    arch = "controlvector"
    directions = [directions[i] for i in range(1, len(directions))]
    writer = gguf.GGUFWriter(path, arch)
    writer.add_string(f"{arch}.model_hint", model.config.model_type)
    writer.add_uint32(f"{arch}.layer_count", len(directions))
    for layer, _ in enumerate(directions):
        writer.add_tensor(f"direction.{layer}", directions[layer].numpy())
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()
    
# Export entry 6 of the unweighted ICV result to `controlvector.gguf` in the current
# working directory.
# NOTE(review): the index 6 is hard-coded, and this relative file name differs from the
# absolute `cv_path` read by the vLLM cell — confirm both refer to the same file.
export_gguf("controlvector.gguf", icv_unweighted[6], model)