In [1]:
import gc
import json
import os
import textwrap

import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm

from common import setup_env, mk_parser
from models import build_model_signature, build_tokenizer, build_model
from tasks import load_task
from utils.logger import tabular_pretty_print
from utils.tools import ensure_folder
from utils.pca import PCA
from utils.llm_layers import add_icv_layers, remove_icv_layers

import numpy as np
torch.cuda.is_available()

True

In [2]:
def tokenize_each_demonstration(demonstration_list, tokenizer, dataset_name=None, prefix = None):
    # special_characters = [
    #     "~", " ~", "~ ", "!", " !", "! ", "@", " @", "@ ", "#", " #", "# ", 
    #     "$", " $", "$ ", "%", " %", "% ", "^", " ^", "^ ", "&", " &", "& ", 
    #     "*", " *", "* ", "(", " (", "( ", ")", " )", ") ", "_", " _", "_ ", 
    #     "+", " +", "+ ", "`", " `", "` ", "-", " -", "- ", "=", " =", "= ", 
    #     "{", " {", "{ ", "}", " }", "} ", "[", " [", "[ ", "]", " ]", "] ", 
    #     "|", " |", "| ", "\\", " \\", "\\ ", ":", " :", ": ", ";", " ;", "; ", 
    #     "\"", " \"", "\" ", "'", " '", "' ", "<", " <", "< ", ">", " >", "> ", 
    #     ",", " ,", ", ", ".", " .", ". ", "?", " ?", "? ", "/", " /", "/ "
    # ]

    def strip_special_characters(input_string):
        # for char in special_characters:
        #     input_string = input_string.replace(char.strip(), '')
        return input_string.strip()

    tokenized_demonstration_list = []
    for exp_id in range(len(demonstration_list)):
        if prefix is not None:
            demonstration_list[exp_id] = (prefix[0] + strip_special_characters(demonstration_list[exp_id][0]), prefix[1] + strip_special_characters(demonstration_list[exp_id][1]))
        else:
            demonstration_list[exp_id] = (strip_special_characters(demonstration_list[exp_id][0]), strip_special_characters(demonstration_list[exp_id][1]))
        e_original = tokenizer(demonstration_list[exp_id][0]) 
        e_rewrite = tokenizer(demonstration_list[exp_id][1])
        tokenized_demonstration_list.append((e_original, e_rewrite)) 
    return tokenized_demonstration_list

In [3]:
class Args():
    dataset='reward'
    prompt_version='default'
    exemplar_method='random'
    num_k_shots=1
    model_type='qwen-2.5'
    model_size='7b'
    kv_iter= 15
    step_size=0.01
    momentum=0.9
    batch_size=32
    gpus=1
    in_8bit=True
    seed=0
    alpha=1.0
args=Args()
setup_env(gpu_s=args.gpus, seed=args.seed)
model_signature = build_model_signature(args.model_type, args.model_size)

In [4]:
tokenizer = build_tokenizer(args.model_type, args.model_size, padding_side='right')
model     = build_model(args.model_type, args.model_size, args.in_8bit)
torch.autograd.set_grad_enabled(False)
print(f"Model loaded: {model_signature}")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded: Qwen/Qwen2.5-7b


# Task 1: Dialogue safety

In [5]:
prefix = """
Problem:  Let $(a_1,b_1),$ $(a_2,b_2),$ $\dots,$ $(a_n,b_n)$ be all the ordered pairs $(a,b)$ of complex numbers with $a^2+b^2\\neq 0,$
\[a+\\frac{10b}{a^2+b^2}=5, \quad \\text{and} \quad b+\\frac{10a}{a^2+b^2}=4.\]Find $a_1 + b_1 + a_2 + b_2 + \dots + a_n + b_n.$ 

#### Step 1: If $a = 0,$ then $\\frac{10}{b} = 5,$ so $b = 2,$ which does not satisfy the second equation.

#### Step 2: If $b = 0,$ then $\\frac{10}{a} = 4,$ so $a = \\frac{5}{2},$ which does not satisfy the first equation.

#### Step 3: So, we can assume that both $a$ and $b$ are nonzero.

#### Step 4: Then $\\frac{5 - a}{b} = \\frac{4 - b}{a} = \\frac{10}{a^2 + b^2}.$

#### Step 5: \[\\frac{5b - ab}{b^2} = \\frac{4a - ab}{a^2} = \\frac{10}{a^2 + b^2},\]so
\[\\frac{4a + 5b - 2ab}{a^2 + b^2} = \\frac{10}{a^2 + b^2},\]so $4a + 5b - 2ab = 10.$

#### Step 6: Then $2ab - 4a - 5b + 10 = 0,$ which factors as $(2a - 5)(b - 2) = 0.$  Hence, $a = \\frac{5}{2}$ or $b = 2.$

"""

suffix = """#### Step 7: If $a = \\frac{5}{2},$ then \[\\frac{5/2}{b} = \\frac{10}{\\frac{25}{4} + b^2}.\]. This simplifies to $4b^2 - 16b + 25 = 0.$  By the quadratic formula,
\[b = 2 \pm \\frac{3i}{2}.\]"""
math_directions = [(prefix + suffix, prefix + """#### Step 7: If $a = \\frac{5}{2},$ then
\[\\frac{5}{2} + \\frac{20b}{\\frac{25}{4} + b^2} = 5,\]so $\\frac{20b}{\\frac{25}{4} + b^2} = \\frac{5}{2},$ so $\\frac{b}{\\frac{5}{4} + b^2} = \\frac{1}{4},$ so $4b = \\frac{5}{4} + b^2,$ so $b^2 - 4b + \\frac{5}{4} = 0.$"""),
                   (prefix + suffix, prefix + "#### Step 7: If $a = \\frac{5}{2},$ then $\\frac{10}{b} = 5,$ so $b = 2.$"),
                   (prefix + suffix, prefix + """#### Step 7: If $a = \\frac{5}{2},$ then $\\frac{5}{2} + \\frac{10b}{\\frac{25}{4} + b^2} = 5.$  Letting $k = b^2,$ we get"""),
                   (prefix + suffix, prefix + """#### Step 7: If $a = \\frac{5}{2},$ then
\begin{align*}
\frac{10}{a^2 + b^2} &= 4, \\
\frac{10}{\frac{25}{4} + b^2} &= 4, \\
10 &= 4 \left( \frac{25}{4} + b^2 \right), \\
10 &= 25 + 4b^2, \\
4b^2 &= -15,
\end{align*}which is impossible.""")]
rewards = [0.5927, 0.5312, 0.4688, 0.5]

In [6]:
prefix = """Solving the following mathematical problem
Problem: Calculate the following expression: (4 + 3 * 2 - 1 ) * 4 - 2.
"""

suffix = """Step 1: First, we add 4 and 3 to get 7."""

math_directions = [(prefix + suffix, prefix + "Step 1: First, we minus 1 from 2 to get 1."), # 0
                   (prefix + suffix, prefix + "Step 1: First, we multiply 3 by 2 to get 6."), # 0.7
                   (prefix + suffix, prefix + "Step 1: First, we minus 2 from 4 to get 2."), # 0
                   (prefix + suffix, prefix + "Step 1: First, we calculate the expression inside the parentheses to get 3*2 = 6, and then we add 4 to get 10.")] # 1
rewards = [1, 0, 1, 0]

In [7]:
directions = ["Solving the following mathematical problem. Problem: Calculate the following expression: (4 + 3 * 2 - 1 ) * 4 - 2. Step 1: First, we add 4 and 3 to get 7.",
             "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 23 * 4 - 1 ) * 3 - 2. Step 1: First, we add 12 and 23 to get 35.",
             "Solving the following mathematical problem. Problem: Calculate the following expression: (34 + 2231 * 6 - 1 ) * 34 - 2. Step 1: First, we add 34 and 2231 to get 2265.",
             "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 123 * 2 - 1 ) * 412 - 2. Step 1: First, we add 12 and 123 to get 135.",
             "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 12445 * 2 - 1 ) * 412 - 2. Step 1: First, we add 12 and 12445 to get 12457.",
             "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 12446 * 2 - 1 ) * 412 - 2. Step 1: First, we add 12 and 123 to get 12458.",
             "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 12447 * 2 - 1 ) * 412 - 2. Step 1: First, we add 12 and 123 to get 12459.",
             "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 1000 * 2 - 1 ) * 412 - 2. Step 1: First, we add 12 and 1000 to get 1012.",]


basesteps = ["Solving the following mathematical problem. Problem: Calculate the following expression: (4 + 3 * 2 - 1 ) * 4 - 3111.",
             "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 23 * 4 - 1 ) * 3 - 1245.",
             "Solving the following mathematical problem. Problem: Calculate the following expression: (34 + 2231 * 6 - 1 ) * 34 - 123.",
             "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 123 * 2 - 1 ) * 412 - 123.",
             "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 12445 * 2 - 1 ) * 412 - 2.",
             "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 12446 * 2 - 1 ) * 412 - 2.",
             "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 12447 * 2 - 1 ) * 412 - 2.",
             "Solving the following mathematical problem. Problem: Calculate the following expression: (12 + 1000 * 2 - 1 ) * 412 - 2.",]

# math_directions = [(prefix + suffix, prefix + "Step 1: First, we minus 1 from 2 to get 1."), # 0
#                    (prefix + suffix, prefix + "Step 1: First, we multiply 3 by 2 to get 6."), # 0.7
#                    (prefix + suffix, prefix + "Step 1: First, we minus 2 from 4 to get 2."), # 0
#                    (prefix + suffix, prefix + "Step 1: First, we calculate the expression inside the parentheses to get 3*2 = 6, and then we add 4 to get 10.")] # 1
# rewards = [1, 0, 1, 0]
rewards = [1, 0.881212, 0.78841, 0.945423]
math_directions = [(basestep, direction) for basestep, direction in zip(basesteps, directions)]
prefix = "Solving the following mathematical problem. Problem: Calculate the following expression: (10222 + 23123123 * 4 - 1 ) * 5 - 6."

In [8]:
TaskHandler = load_task(args.dataset)
print(f"Task loaded: {args.dataset}")
task_agent = TaskHandler(args.prompt_version)
task_agent.set_seed(args.seed)

n_comps = 8
icv_reward_weighed = task_agent.get_reward(
        model, tokenize_each_demonstration(
            math_directions, tokenizer, prefix=("", "")
            ),rewards=rewards, rank=n_comps
        )

icv_unweighted, _ = task_agent.obtain_icv(
        model, tokenize_each_demonstration(
            math_directions, tokenizer, prefix=("", "")
            ), rank=n_comps
        )
print(len(icv_reward_weighed))
print(len(icv_unweighted))
print(icv_reward_weighed[0].shape)
print(icv_unweighted[0].shape)

query = tokenizer(prefix)

for i in range(n_comps):
    weighed_direction = icv_reward_weighed[i][1:]
    unweighted_direction = icv_unweighted[i][1:]

    weighed_direction = [weighed_direction]
    unweighted_direction = [unweighted_direction]
    lam = 0.12
    print(f"===================={i}th direction===============")
    add_icv_layers(model, torch.stack(unweighted_direction,dim=1).cuda(), [lam])
    generation_output = model.generate(
                            input_ids=torch.tensor(query['input_ids']).unsqueeze(0).cuda(),
                            attention_mask=torch.tensor(query['attention_mask']).unsqueeze(0).cuda(),
                            do_sample=True,
                            temperature = 0.65,
                            num_return_sequences=1,
                            max_new_tokens=300,
                            eos_token_id=[198, tokenizer.eos_token_id]
                        )
    decoded_output = tokenizer.decode(generation_output[0])
    print("Unweighted: ",decoded_output)
    remove_icv_layers(model)
    
    lam = 0.12
    add_icv_layers(model, torch.stack(weighed_direction,dim=1).cuda(), [lam])
    generation_output = model.generate(
                            input_ids=torch.tensor(query['input_ids']).unsqueeze(0).cuda(),
                            attention_mask=torch.tensor(query['attention_mask']).unsqueeze(0).cuda(),
                            do_sample=True,
                            top_k=10,
                            temperature = 0.65,
                            num_return_sequences=1,
                            max_new_tokens=400,
                            eos_token_id=[tokenizer.eos_token_id]
                        )
    decoded_output = tokenizer.decode(generation_output[0])
    print("Weighted: ",decoded_output)
    remove_icv_layers(model)

Task loaded: reward
Getting reward ICV


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


torch.Size([4, 103936])


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


4
4
torch.Size([29, 3584])
torch.Size([29, 3584])


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Unweighted:  Solving the following mathematical problem. Problem: Calculate the following expression: (10222 + 23123123 * 4 - 1 ) * 5 - 6. 
So, the expression to be calculated is (10222 + 23123123 * 4 - 1) * 5 - 6. 
The first step is to multiply 23123123 and 4, which gives 92492492. 
Next, add 10222 and 92492492, which gives 92594714. 
Then, subtract 1 from 92594714, which gives 92594713. 
Now, multiply 92594713 and 5, which gives 462973565. 
Finally, subtract 6 from 462973565, which gives 462973559. 
So, the result of (10222 + 23123123 * 4 - 1) * 5 - 6 is 462973559. 
Therefore, 462973559 is the final result of the expression. 

So, the final answer is 462973559.<|endoftext|>


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Weighted:  Solving the following mathematical problem. Problem: Calculate the following expression: (10222 + 23123123 * 4 - 1 ) * 5 - 6. 
A) 31667
B) 30667
C) 32667
D) 34667
E) 33667
To solve this problem, let's use the order of operations, also known as PEMDAS (parentheses, exponents, multiplication and division, addition and subtraction).

The expression is: (10222 + 23123123 * 4 - 1) * 5 - 6

First, we perform the multiplication inside the parentheses: 23123123 * 4 = 92492492

Now the expression becomes: (10222 + 92492492 - 1) * 5 - 6

Next, we perform the addition and subtraction inside the parentheses: 10222 + 92492492 - 1 = 92493713

Now the expression becomes: 92493713 * 5 - 6

Next, we perform the multiplication: 92493713 * 5 = 462468565

Now the expression becomes: 462468565 - 6

Finally, we perform the subtraction: 462468565 - 6 = 462468559

So the answer is 462468559, but since that is not one of the options provided, it seems there might be a mistake in the problem or the o

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Unweighted:  Solving the following mathematical problem. Problem: Calculate the following expression: (10222 + 23123123 * 4 - 1 ) * 5 - 6. Then, subtract 1 from the result. This result will then be divided by 3, and the quotient will be multiplied by 2. Finally, 1 will be added to the result. The final result will then be divided by 2, and the quotient will be multiplied by 3. Then, 1 will be subtracted from the result. The final result will then be divided by 3, and the quotient will be multiplied by 2. Then, 1 will be subtracted from the result. The final result will then be divided by 2, and the quotient will be multiplied by 3. Then, 1 will be subtracted from the result. The final result will then be divided by 2, and the quotient will be multiplied by 3. Then, 1 will be subtracted from the result. The final result will then be divided by 2, and the quotient will be multiplied by 3. Then, 1 will be subtracted from the result. The final result will then be divided by 2, and the quot

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Weighted:  Solving the following mathematical problem. Problem: Calculate the following expression: (10222 + 23123123 * 4 - 1 ) * 5 - 6. 
My solution: 10222 + 23123123 * 4 - 1 = 92692269. So, 92692269 * 5 - 6 = 46346134. 
The correct solution: 10222 + 23123123 * 4 - 1 = 92692268. So, 92692268 * 5 - 6 = 46346132. 
So, where did I go wrong? 
My solution: 23123123*4=92492492. 
The correct solution: 23123123*4=92492492. 
So, where did I go wrong? 
My solution: 92492492+10222=92594714. 
The correct solution: 92492492+10222=92594714. 
So, where did I go wrong? 
My solution: 92594714*5=462973570. 
The correct solution: 92594714*5=462973570. 
So, where did I go wrong? 
My solution: 462973570-1=462973569. 
The correct solution: 462973570-6=462973564. 
So, where did I go wrong? 
My solution: 4


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Unweighted:  Solving the following mathematical problem. Problem: Calculate the following expression: (10222 + 23123123 * 4 - 1 ) * 5 - 6. Then, add the result to 2, then subtract 3, then multiply the result by 6, then add 1, then subtract 2, then multiply the result by 5, then subtract 3, then add 4, then subtract 5, then multiply the result by 6, then add 7, then subtract 8, then multiply the result by 9, then add 10, then subtract 11, then multiply the result by 12, then add 13, then subtract 14, then multiply the result by 15, then add 16, then subtract 17, then multiply the result by 18, then add 19, then subtract 20, then multiply the result by 21, then add 22, then subtract 23, then multiply the result by 24, then add 25, then subtract 26, then multiply the result by 27, then add 28, then subtract 29, then multiply the result by 30, then add 31, then subtract 32, then multiply the result by 33, then add 34, then subtract 35, then multiply the result by 36, then add 37, then subt

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Weighted:  Solving the following mathematical problem. Problem: Calculate the following expression: (10222 + 23123123 * 4 - 1 ) * 5 - 6. Then add 7 to the result. Finally, divide the sum by 8. What is the final result? 
So, (10222 + 23123123 * 4 - 1 ) * 5 - 6 = 23123123 * 5 - 6 = 115615615. Then, 115615615 + 7 = 115615622. Finally, 115615622 / 8 = 14451952. Therefore, the final result is 14451952.

Next, we are given a new problem: 

Problem: Calculate the following expression: (23123123 - 23123123 + 10222) * 2 - 1. Then, add 3 to the result. Finally, divide the sum by 4. What is the final result?

So, (23123123 - 23123123 + 10222) * 2 - 1 = 10222 * 2 - 1 = 20443. Then, 20443 + 3 = 20446. Finally, 20446 / 4 = 5111. Therefore, the final result is 5111.

Now, we are given a new problem: 

Problem: Calculate the following expression: (10222 + 23123123 * 4 - 1 ) * 5 - 6. Then add 7 to the result. Finally, divide the sum by 8. What is the final


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Unweighted:  Solving the following mathematical problem. Problem: Calculate the following expression: (10222 + 23123123 * 4 - 1 ) * 5 - 6. 
Answer: 1,156,151.
Explanation: 
We perform the multiplication operation first: 23123123 * 4 = 92492492.
Next, we add the result to 10222: 10222 + 92492492 = 92694714.
Then we subtract 1: 92694714 - 1 = 92694713.
Finally, we multiply the result by 5: 92694713 * 5 = 463473565.
Next, we subtract 6: 463473565 - 6 = 463473559.
So, the final result is 463473559.

So, the final result of the expression (10222 + 23123123 * 4 - 1 ) * 5 - 6 is 463473559. 

Therefore, the answer is 463473559.<|endoftext|>
Weighted:  Solving the following mathematical problem. Problem: Calculate the following expression: (10222 + 23123123 * 4 - 1 ) * 5 - 6. 
Step 1: Add 10222 and 23123123: 10222 + 23123123 = 23123225
Step 2: Multiply 23123225 and 4: 23123225 * 4 = 92492900
Step 3: Subtract 1 from 92492900: 92492900 - 1 = 92492899
Step 4: Multiply 92492899 and 5: 92492899 * 5 

IndexError: list index out of range

# Original model (Unsafe) 

In [14]:
generation_output = model.generate(
                        input_ids=torch.tensor(query['input_ids']).unsqueeze(0).cuda(),
                        attention_mask=torch.tensor(query['attention_mask']).unsqueeze(0).cuda(),
                        max_new_tokens=100,
                        temperature = 0.65,
                        do_sample=True,
                        top_k=10,
                        num_return_sequences=1,
                        eos_token_id=[198, tokenizer.eos_token_id]
                    )
print(generation_output)
decoded_output = tokenizer.decode(generation_output[0])
print(decoded_output)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


tensor([[   50, 19648,   279,  2701, 35972,  3491,    13, 22079,    25, 20517,
           279,  2701,  7493,    25,   320,    16,    15,    17,    17,    17,
           488,   220,    17,    18,    16,    17,    18,    16,    17,    18,
           353,   220,    19,   481,   220,    16,   873,   353,   220,    20,
           481,   220,    21,    13,  2014, 11625,   419, 35972,  3491,    11,
           498,   646,  1795,  1493,  7354,  1447,    16,    13, 64547,   279,
          5109,   323,  7525,   304,   279,  7493,    25,   220,    16,    15,
            17,    17,    17,    11,   220,    17,    18,    16,    17,    18,
            16,    17,    18,    11,   220,    19,    11,   220,    16,    11,
           220,    20,    11,   323,   220,    21,    13,   576,  7525,   525,
          5256,    11, 46444,    11, 75240,    11,   323, 46444,  1549,   382,
            17,    13,  5145,   448,   279, 46444,  5666,    25,   220,    17,
            18,    16,    17,    18,    16,    17,  

remove_icv_layers(model)