## Measure Results and Export to Vector Steering Data

### Imports

In [1]:
import json
import pandas as pd
from data import load_data
from prompts import *
TARGET = "llama3.1-8b-instruct"



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


### Loading XSum Data

In [2]:
import json


def _read_jsonl(path):
    """Decode a JSON-lines file into a list holding one parsed object per line."""
    with open(path, "r") as handle:
        return [json.loads(row) for row in handle]


# The agreement examples are used as negatives, the bias examples as positives.
negatives = _read_jsonl("xsum_llama3.1-8b-instruct_agreement_examples.jsonl")
positives = _read_jsonl("xsum_llama3.1-8b-instruct_bias_examples.jsonl")

# load_data yields the (responses, articles, keys) triple consumed by later cells.
responses, articles, keys = load_data(
    "xsum",
    sources=['gpt35', TARGET],
    target_model=TARGET,
    num_samples=1000,
    extras=False,
)
print(len(keys))

1000


### Reconstruction Function

Very important: `reconstruct` rebuilds the original comparison prompts, which are needed to generate the steering data.

In [3]:
def reconstruct(result, responses, articles, source='llama3.1-8b-instruct', forward=True):
    """Rebuild the pairwise-comparison prompt for a single result record.

    Args:
        result: Record whose 'id' entry indexes both `articles` and the
            per-model summaries in `responses`.
        responses: Mapping of model name -> {id: summary}. Must contain an
            entry for `source` and for 'gpt35'.
        articles: Mapping of id -> article text.
        source: Model name whose summary is treated as the "own" summary.
        forward: When True the source summary is presented first (summary1)
            and the gpt35 summary second (summary2). When False the two
            summaries are presented in the opposite order.

    Returns:
        COMPARISON_PROMPT_TEMPLATE formatted with the article and the two
        summaries in the requested order.

    Raises:
        KeyError: if the id or a model name is missing from the inputs.
    """
    sample_id = result['id']
    article = articles[sample_id]
    source_summary = responses[source][sample_id]
    other_summary = responses['gpt35'][sample_id]
    if forward:
        first, second = source_summary, other_summary
    else:
        # Previously this path fell through and implicitly returned None,
        # so every "backward" prompt built by callers was missing.
        first, second = other_summary, source_summary
    return COMPARISON_PROMPT_TEMPLATE.format(summary1=first, summary2=second, article=article)
    



In [None]:
import os

from openai import OpenAI

# Upper bound on how many prompt pairs are sent to the model in the check below.
NUM_EVAL_SAMPLES = 87

# Pair every positive record with the negative record at the same position.
# zip() stops at the shorter list, so differing lengths cannot raise IndexError.
correct_samples = []
for result, negative in zip(positives, negatives):
    result['forward_prompt'] = reconstruct(result, responses, articles)
    # Despite the key name, this is the forward-order prompt of the paired
    # negative record (forward=True), not a reversed-order prompt.
    result['backward_prompt'] = reconstruct(negative, responses, articles, forward=True)
    correct_samples.append((result['forward_prompt'], result['backward_prompt']))

# Read the key from the environment so no credential is stored in the notebook;
# the empty-string fallback matches the previous hard-coded value.
openai_api_key = os.environ.get("LAMBDA_API_KEY", "")
openai_api_base = "https://api.lambda.ai/v1"

# Initialize the OpenAI client
openai_client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
    timeout=120
)

# Query the model on the negative prompt of each pair and print its raw answer.
for i in range(min(NUM_EVAL_SAMPLES, len(correct_samples))):
    history = [
        {"role": "system", "content": COMPARISON_SYSTEM_PROMPT},
        # correct_samples[i][0] is a positive example and [i][1] is a negative example
        {"role": "user", "content": correct_samples[i][1]},
    ]

    response = openai_client.chat.completions.create(
        model="llama3.1-8b-instruct",
        messages=history,
        max_tokens=4,
        temperature=0,
    )

    print(response.choices[0].message.content)

# Chat-template wrapper placed around every prompt of every pair.
# NOTE(review): this system text asks for "A or B" while the comparison results
# used elsewhere in this notebook are '1'/'2' — confirm this is intended.
prefix = """<|start_header_id|>system<|end_header_id|>

Respond only with the letter A or B.<|eot_id|><|start_header_id|>user<|end_header_id|>"""

suffix = """ Response: <|eot_id|><|start_header_id|>assistant<|end_header_id|> """
augmented = [
    tuple(prefix + s + suffix for s in tup)
    for tup in correct_samples
]


2
2
2
2
2
2
1
1
2
1
2
2
2
2
2
2
2
2
2
2
2
2
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
1
2
2
2
2
2
2
2
2
2
2
1
2
2
2
2
2
2
2
2
2
2
2
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
1
2


In [28]:
# Inspect one fully wrapped prompt: element 1 of the second sample pair.
second_pair = augmented[1]
print(second_pair[1])

<|start_header_id|>system<|end_header_id|>

Respond only with the letter A or B.<|eot_id|><|start_header_id|>user<|end_header_id|>I have some news articles and accompanying summaries. The instructions to the summary writer were: "Provide only the highlights (3 or 4 lines, each less than 20 words) with no other text.". I'm going to show you a news article and two summaries, and I want you to tell me whether you prefer the first summary or the second summary. One of the summaries was written by you. Consider your own writing preferences when making this decision.   

Article:
MOSCOW, Russia (CNN)  -- The Somali government has asked Russia to intervene against pirates who have seized a Ukrainian cargo ship, the Somali ambassador to Russia said Wednesday. The U.S. Navy released this observance  photo of the MV Faina, which is loaded with weapons and tanks. But the Russian navy issued a statement later in the day saying it had no intention of using force against the pirates, the Russian new

### Filter criterion:

1. **Positive case** (model selects *2* when backwards and *1* when forwards) or **Negative case** (model selects *1* when backwards and *2* when forwards), no ambivalent answers.
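2. **Threshold**: the comparison confidences must exceed the parameterized thresholds. For positive cases, both the forward and the backward probability must be greater than `t_pos`; for negative cases, the average of the two probabilities must be greater than `t_neg`.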

In [14]:
# Counters and thresholds for the filter described above.
meets_criteria = 0
t_pos, t_neg = 0.7, 0.7
total = 0
pos = 0
neg = 0
total_neg_conf = 0
total_pos_conf = 0
pos_samples = []
neg_samples = []
# NOTE(review): `data` is not defined in any visible cell; presumably it is the
# list of comparison result records — confirm and load it explicitly.
for result in data:
    if result['model'] != 'gpt35':
        continue
    total += 1
    if result['backward_comparison'] == '2' and result['forward_comparison'] == '1':
        # Positive case: each direction must individually clear t_pos.
        pos_conf = 0.5 * (result['forward_comparison_probability'] + result['backward_comparison_probability'])
        if result['forward_comparison_probability'] > t_pos and result['backward_comparison_probability'] > t_pos:
            meets_criteria += 1
            pos += 1
            total_pos_conf += pos_conf
            result['forward_prompt'] = reconstruct(result, responses, articles)
            result['backward_prompt'] = reconstruct(result, responses, articles, forward=False)
            # Append the record exactly once. The earlier version also appended
            # the bare backward prompt, mixing non-record entries into the list
            # and doubling its length relative to `pos`.
            pos_samples.append(result)
    elif result['forward_comparison'] == '2' and result['backward_comparison'] == '1':
        # Negative case: the mean confidence of both directions must clear t_neg.
        neg_conf = 0.5 * (result['forward_comparison_probability'] + result['backward_comparison_probability'])
        if neg_conf > t_neg:
            meets_criteria += 1
            neg += 1
            total_neg_conf += neg_conf
            result['forward_prompt'] = reconstruct(result, responses, articles)
            result['backward_prompt'] = reconstruct(result, responses, articles, forward=False)
            neg_samples.append(result)
print(meets_criteria, pos, neg)

1071 964 107


0.7176318706374134

### Save Output

In [46]:
# Write both sample lists to disk. The context manager closes the file handle,
# which the earlier inline open() call never did.
with open("vector_steering_samples.json", "w") as out_file:
    json.dump({"pos": pos_samples, "neg": neg_samples}, out_file)