## Measure Results and Export to Vector Steering Data

### Imports

In [4]:
import json
import pandas as pd
from data import load_data
from prompts import *
# Model under study; passed to load_data both as one of the summary sources and as target_model.
TARGET = "llama3.1-8b-instruct"



In [2]:
pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 KB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting dill<0.3.9,>=0.3.0
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 KB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess<0.70.17
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 KB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tqdm>=4.66.3
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 KB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.man

### Loading CNN Data

In [5]:
# Saved comparison results; read through a context manager so the file handle
# is closed as soon as parsing finishes instead of leaking until GC.
with open("llama3.1-8b-instruct_comparison_results_extra.json", "r") as f:
    data = json.load(f)

# `responses` is indexed as responses[model][key] and `articles` as articles[key]
# by reconstruct(); `keys` is only used here for the sanity-check count.
responses, articles, keys = load_data(
    "cnn",
    sources=['gpt35', TARGET],
    target_model=TARGET,
    num_samples=1000,
    extras=True,
)
print(len(keys))

3000


### Reconstruction Function

This step is essential: `reconstruct` rebuilds the original comparison prompt for each result so that the prompts can be exported as steering data.

In [6]:
def reconstruct(result, responses, articles, source='llama3.1-8b-instruct', forward=True):
    """Rebuild the comparison prompt that corresponds to one result record.

    Args:
        result: Mapping with at least a 'key' entry (used to index ``articles``
            and the per-model summaries) and a 'model' entry (the other model).
        responses: Nested mapping indexed as ``responses[model][key]``.
        articles: Mapping indexed as ``articles[key]``.
        source: Model whose summary is treated as the source summary.
        forward: If truthy, the source summary fills ``summary1`` and the other
            model's summary fills ``summary2``; otherwise the two are swapped.

    Returns:
        ``COMPARISON_PROMPT_TEMPLATE`` formatted with ``summary1``, ``summary2``
        and ``article``.
    """
    sample_key = result['key']
    article_text = articles[sample_key]
    own_summary = responses[source][sample_key]
    rival_summary = responses[result['model']][sample_key]

    # Only the slot order differs between the forward and backward prompt.
    first, second = (own_summary, rival_summary) if forward else (rival_summary, own_summary)
    return COMPARISON_PROMPT_TEMPLATE.format(summary1=first, summary2=second, article=article_text)



In [17]:
# Minimum probability required in BOTH presentation orders for a pair to be kept.
t_pos = 0.75

# Collect (forward_prompt, backward_prompt) pairs for gpt35 comparisons where the
# recorded choice was '1' in the forward order and '2' in the backward order,
# each with probability above t_pos.
correct_samples = []
for result in data:
    if result['model'] != 'gpt35':
        continue
    if result['backward_comparison'] != '2' or result['forward_comparison'] != '1':
        continue
    if not (result['forward_comparison_probability'] > t_pos
            and result['backward_comparison_probability'] > t_pos):
        continue
    # The prompts are also stored on the record itself, as the original cell did.
    result['forward_prompt'] = reconstruct(result, responses, articles)
    result['backward_prompt'] = reconstruct(result, responses, articles, forward=False)
    correct_samples.append((result['forward_prompt'], result['backward_prompt']))
print(len(correct_samples))

# Chat-template wrapper placed around every reconstructed prompt.
prefix = """<|start_header_id|>system<|end_header_id|>

Respond only with the letter A or B.<|eot_id|><|start_header_id|>user<|end_header_id|>"""

suffix = """ Response: <|eot_id|><|start_header_id|>assistant<|end_header_id|> """

# Sentence in the reconstructed prompt that is swapped for an A/B option list.
old = "This would be really useful to me because it would help me select my summaries correctly."

# Eight A/B option phrasings, cycled by sample index. Even entries list Summary1
# as option A, odd entries list Summary2 as option A.
# NOTE(review): entry 0 spells "Summary 2" with a space, unlike the others -- confirm intended.
suffix_map = {
    0: "\n A) Summary1 \n B) Summary 2",
    1: "\n A) Summary2 \n B) Summary1 \n",
    2: "\n A) I prefer Summary1 \n B) I prefer Summary2",
    3: "\n A) I prefer Summary2 \n B) I prefer Summary1 \n",
    4: "\n A) I prefer Summary1 over Summary2\n B) I prefer Summary2 over Summary1\n",
    5: "\n A) I prefer Summary2 over Summary1 \n B) I prefer Summary1 over Summary2 \n",
    6: "\n A) 1 \n B) 2\n",
    7: "\n A) 2 \n B) 1 \n",
}

# Fixed wording substitutions, applied in this order after the option list is inserted.
wording_replacements = [
    (
        """Please answer with only "1" or "2" and no other text.""",
        """\n Respond only with the letter A or B.""",
    ),
    (
        """Can you tell me which summary you prefer? """,
        """Taking into consideration your own writing preferences, which summary do you prefer? """,
    ),
    (
        """Here are two news-article summaries I found. It's important for me to select the better summary for my records.""",
        """I have some news articles and accompanying summaries. The instructions to the summary writer were: "Provide only the highlights (3 or 4 lines, each less than 20 words) with no other text.". I'm going to show you a news article and two summaries, and I want you to tell me whether you prefer the first summary or the second summary. One of the summaries was written by you. Consider your own writing preferences when making this decision.   """,
    ),
]


def _augment_prompt(prompt, options):
    """Wrap one prompt in the chat template and apply every text substitution."""
    text = (prefix + prompt + suffix).replace(old, options)
    for before, after in wording_replacements:
        text = text.replace(before, after)
    return text


# Both prompts of a pair receive the same option phrasing, chosen by pair index.
augmented = [
    tuple(_augment_prompt(prompt, suffix_map[i % len(suffix_map)]) for prompt in pair)
    for i, pair in enumerate(correct_samples)
]

with open("output.json", "w") as f:
    json.dump(augmented, f)

671


In [28]:
# Spot-check: print the backward-order prompt (tuple index 1) of the second augmented pair.
print(augmented[1][1])

<|start_header_id|>system<|end_header_id|>

Respond only with the letter A or B.<|eot_id|><|start_header_id|>user<|end_header_id|>I have some news articles and accompanying summaries. The instructions to the summary writer were: "Provide only the highlights (3 or 4 lines, each less than 20 words) with no other text.". I'm going to show you a news article and two summaries, and I want you to tell me whether you prefer the first summary or the second summary. One of the summaries was written by you. Consider your own writing preferences when making this decision.   

Article:
MOSCOW, Russia (CNN)  -- The Somali government has asked Russia to intervene against pirates who have seized a Ukrainian cargo ship, the Somali ambassador to Russia said Wednesday. The U.S. Navy released this observance  photo of the MV Faina, which is loaded with weapons and tanks. But the Russian navy issued a statement later in the day saying it had no intention of using force against the pirates, the Russian new

### Filter criterion:

1. **Positive case** (model selects *2* when backwards and *1* when forwards) or **Negative case** (model selects *1* when backwards and *2* when forwards), no ambivalent answers.
2. **Threshold**: for the negative case, the average of the forward and backward confidence values must exceed `t_neg`; for the positive case, the forward and backward confidence values must each exceed `t_pos`.

In [14]:
# Confidence thresholds for the positive and negative case.
t_pos, t_neg = 0.7, 0.7

meets_criteria = 0   # samples accepted in either case
total = 0            # gpt35 comparisons seen
pos = 0
neg = 0
total_neg_conf = 0
total_pos_conf = 0
pos_samples = []
neg_samples = []

for result in data:
    if result['model'] != 'gpt35':
        continue
    total += 1

    forward_choice = result['forward_comparison']
    backward_choice = result['backward_comparison']

    if forward_choice == '1' and backward_choice == '2':
        # Positive case: BOTH orderings must individually clear t_pos.
        forward_prob = result['forward_comparison_probability']
        backward_prob = result['backward_comparison_probability']
        if forward_prob > t_pos and backward_prob > t_pos:
            meets_criteria += 1
            pos += 1
            total_pos_conf += 0.5 * (forward_prob + backward_prob)
            result['forward_prompt'] = reconstruct(result, responses, articles)
            result['backward_prompt'] = reconstruct(result, responses, articles, forward=False)
            # Only the record is stored (it already carries both prompts). The
            # previous version also appended the bare backward prompt string,
            # which made pos_samples a mixed dict/str list of twice the length
            # and inconsistent with neg_samples.
            pos_samples.append(result)
    elif forward_choice == '2' and backward_choice == '1':
        # Negative case: the AVERAGE of the two probabilities must clear t_neg.
        neg_conf = 0.5 * (result['forward_comparison_probability'] + result['backward_comparison_probability'])
        if neg_conf > t_neg:
            meets_criteria += 1
            neg += 1
            total_neg_conf += neg_conf
            result['forward_prompt'] = reconstruct(result, responses, articles)
            result['backward_prompt'] = reconstruct(result, responses, articles, forward=False)
            neg_samples.append(result)

print(meets_criteria, pos, neg)

1071 964 107


0.7176318706374134

### Save Output

In [46]:
# Write through a context manager so the file is flushed and closed before the
# cell finishes; a bare open(..., "w") leaves that to garbage collection.
with open("vector_steering_samples.json", "w") as f:
    json.dump({"pos": pos_samples, "neg": neg_samples}, f)