In [3]:
import math
import json
import random
import os
from pathlib import Path
import time

import asyncio
import nest_asyncio
import numpy as np
import pandas as pd
from tqdm.asyncio import tqdm as tqdm_asyncio
from openai import OpenAI, APIError, RateLimitError, APIConnectionError

from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception


N_ATTRIBUTES = 5  # per choice context

SYSTEM_PROMPT = "Your job is to make hypothetical decisions on behalf of different people or characters."

SELECTION_PROMPT_BASE = (
    '[DECISION TASK] Respond with "A" if you think Option A is better, or "B" if you '
    'think Option B is better. Never respond with anything except "A" or "B":\n\n'
)

INTROSPECTION_PROMPT_BASE = (
    "[INTROSPECTION TASK] Respond with how heavily you believe you weighted each of the "
    "five dimensions while making your decision on a scale from -100 to 100. Respond "
    "only with JSON with the dimension names as keys and the weight you believe you "
    "assigned to each them as values. Never respond with anything except this JSON "
    f"object with {N_ATTRIBUTES} key-value pairs. (Do not report your decision itself.):\n\n"
)

ROLE_SHUFFLING_SEED = 0
WEIGHTS_SEED = 1
SELECTIONS_SEED = 2
FT_EXAMPLE_SEED = 3
FINE_TUNING_API_SEED = 4
VALIDATION_SEED = 5
FT_ON_INSTILL_SEED = 6

## Step 1: Load Decision Scenarios and Roles

Each scenario represents a different decision context (e.g., choosing between condos, loans, vacations) with 5 quantitative attributes.


In [4]:
class Scenario:
    def __init__(self, short_name, question, attributes):
        self.short_name = short_name
        self.question = question
        self.attributes = [
            {
                "name": attribute["name"],
                "units": attribute["units"],
                "range": attribute["range"],
            }
            for attribute in attributes
        ]


class Trial:
    def __init__(self, scenario):
        self.scenario = scenario
        self.option_A = Option(scenario, "A")
        self.option_B = Option(scenario, "B")

    def generate_choice(self):
        prompt = (
            f"{self.scenario.question}\n"
            f"{self.option_A.description}\n\n"
            f"{self.option_B.description}"
        )
        return prompt


class Option:
    def __init__(self, scenario, letter):
        self.letter = letter
        self.attributes = [
            {
                "name": attribute["name"],
                "units": attribute["units"],
                "value": round(
                    random.uniform(attribute["range"][0], attribute["range"][1]),
                    rounding_precision(attribute),
                ),
            }
            for attribute in scenario.attributes
        ]
        self.description = (
            self.letter
            + ":\n"
            + "\n".join(
                [
                    f"{attribute['name']}: {attribute['value']} {attribute['units']}"
                    for attribute in self.attributes
                ]
            )
        )


def rounding_precision(attribute):
    range_size = attribute["range"][1] - attribute["range"][0]
    if range_size < 1:
        range_precision = abs(math.floor(math.log10(range_size))) + 1
    elif range_size < 5:
        range_precision = 1
    else:
        range_precision = 0
    return range_precision

In [5]:
# Combine the ~1000 choices with the ~1000 agents into choice contexts.
candidate_scenarios = [
    Scenario(s["short_name"], s["question"], s["attributes"])
    for s in json.loads(open("data/candidate_scenarios.json").read())
]

print("Total number of candidate scenarios: ", len(candidate_scenarios))

scenario_1 = candidate_scenarios[0]
for i in range(len(scenario_1.attributes)):
    print(f"\nAttribute {i+1}:")
    print(scenario_1.attributes[i]["name"])
    print(scenario_1.attributes[i]["units"])
    print(scenario_1.attributes[i]["range"])
    print(scenario_1.attributes[i]["range"][0])
    print(scenario_1.attributes[i]["range"][1])


Total number of candidate scenarios:  1365

Attribute 1:
caffeine_content
mg per cup
[0, 70]
0
70

Attribute 2:
leaf_size
millimeters
[0.1, 10]
0.1
10

Attribute 3:
oxidation_level
percent
[0, 100]
0
100

Attribute 4:
steeping_time
minutes
[1, 7]
1
7

Attribute 5:
resteep_potential
number of steeps
[1, 8]
1
8


In [6]:
roles = pd.read_csv("data/roles.csv", header=None)[0].tolist()[1:]
print(len(roles))
random.seed(ROLE_SHUFFLING_SEED)
random.shuffle(roles)


1133


In [7]:
scenarios = candidate_scenarios[:1100]
for i, scenario in enumerate[Scenario](scenarios):
    scenario.question = f"Imagine you are {roles[i]}. {scenario.question}"

In [8]:
scenarios_csv = Path("data/scenarios.csv")
if not scenarios_csv.exists():
    tabular_scenarios = pd.DataFrame(
        [
            {
                "scenario": s.short_name,
                "question": s.question,
                **{f"attr{i+1}": a["name"] for i, a in enumerate(s.attributes)},
                **{f"attr{i+1}_min": a["range"][0] for i, a in enumerate(s.attributes)},
                **{f"attr{i+1}_max": a["range"][1] for i, a in enumerate(s.attributes)},
            }
            for s in scenarios
        ]
    )
    tabular_scenarios.to_csv(scenarios_csv, index=False)
else:
    tabular_scenarios = pd.read_csv(scenarios_csv)

tabular_scenarios.head()

Unnamed: 0,scenario,question,attr1,attr2,attr3,attr4,attr5,attr1_min,attr2_min,attr3_min,attr4_min,attr5_min,attr1_max,attr2_max,attr3_max,attr4_max,attr5_max
0,loose_leaf_tea,Imagine you are Djedefre. Which loose leaf tea...,caffeine_content,leaf_size,oxidation_level,steeping_time,resteep_potential,0.0,0.1,0.0,1.0,1.0,70.0,10.0,100.0,7.0,8.0
1,cutting_board,Imagine you are Antonín Dvořák. Which cutting ...,board_thickness,surface_area,knife_mark_resistance,grip_stability,ease_of_cleaning,1.0,300.0,1.0,1.0,1.0,5.0,1200.0,10.0,10.0,10.0
2,knee_pads,Imagine you are Lara Croft. Which knee pads wo...,padding_thickness,strap_width,ventilation_holes,impact_rating,coverage_area,10.0,20.0,2.0,5.0,12.0,30.0,50.0,12.0,20.0,30.0
3,ice_cream,Imagine you are Johannes Gutenberg. Which ice ...,creaminess,mix_in_density,serving_size,flavor_intensity,smoothness,10.0,0.0,4.0,20.0,10.0,18.0,20.0,16.0,100.0,50.0
4,painters_tape,Imagine you are Queen Amina. Which painter's t...,clean_removal_time,uv_resistance,width,paint_bleed_resistance,roll_length,7.0,3.0,24.0,6.0,30.0,60.0,21.0,72.0,10.0,180.0


## Step 2: Generate Target Preference Weights

For each scenario, generate random weights for the 5 attributes. These are the "ground truth" weights that we want the model to learn.


In [10]:
def generate_weights():
    raw_weights = [random.uniform(-100, 100) for _ in range(N_ATTRIBUTES)]

    # Scale weights so the largest absolute value is always 100.
    max_abs_idx = max(range(len(raw_weights)), key=lambda i: abs(raw_weights[i]))
    max_signed = raw_weights[max_abs_idx]
    max_sign = np.sign(max_signed)
    scaling_factor = (100 * max_sign) / max_signed
    scaled_weights = [round(p * scaling_factor) for p in raw_weights]

    return {f"attr{i+1}": val for i, val in enumerate(scaled_weights)}


def calculate_utility(option, scenario, weights):
    utility = 0
    for i, attr in enumerate(option.attributes):
        attr_min = scenario.attributes[i]["range"][0]
        attr_max = scenario.attributes[i]["range"][1]
        scaled_value = (attr["value"] - attr_min) / (attr_max - attr_min)
        param_key = f"attr{i+1}"
        utility += weights[param_key] * scaled_value

    return utility


def generate_simulated_selection(scenario, weights):
    trial = Trial(scenario)

    utility_A = calculate_utility(trial.option_A, scenario, weights)
    utility_B = calculate_utility(trial.option_B, scenario, weights)

    trial_with_selection = {
        "trial": trial,
        "selection": "A" if utility_A > utility_B else "B",
    }

    return trial_with_selection

In [11]:
n_ft_examples_per_scenario = 50
n_val_examples_per_scenario = 10
examples_per_scenario = n_ft_examples_per_scenario + n_val_examples_per_scenario
random.seed(WEIGHTS_SEED)
generated_weights = {scenario.short_name: generate_weights() for scenario in scenarios}
random.seed(SELECTIONS_SEED)
simulated_choices = {
    scenario.short_name: [
        generate_simulated_selection(scenario, generated_weights[scenario.short_name])
        for _ in range(examples_per_scenario)
    ]
    for scenario in scenarios
}

In [12]:
print("\nScenario 1:")
print(simulated_choices["loose_leaf_tea"][0]['trial'].scenario.short_name)

print("\nScenario 1 Question:")
print(simulated_choices["loose_leaf_tea"][0]['trial'].scenario.question)

print("\nScenario 1 Attributes:")
print(simulated_choices["loose_leaf_tea"][0]['trial'].scenario.attributes)

print("\nScenario 1 Option A letter:")
print(simulated_choices["loose_leaf_tea"][0]['trial'].option_A.letter)

print("\nScenario 1 Option A attributes:")
print(simulated_choices["loose_leaf_tea"][0]['trial'].option_A.attributes)

print("\nScenario 1 Option A description:")
print(simulated_choices["loose_leaf_tea"][0]['trial'].option_A.description)

print("\nScenario 1 Option B letter:")
print(simulated_choices["loose_leaf_tea"][0]['trial'].option_B.letter)

print("\nScenario 1 Option B attributes:")
print(simulated_choices["loose_leaf_tea"][0]['trial'].option_B.attributes)

print("\nScenario 1 Option B description:")
print(simulated_choices["loose_leaf_tea"][0]['trial'].option_B.description)


Scenario 1:
loose_leaf_tea

Scenario 1 Question:
Imagine you are Barbara McClintock. Which loose leaf tea would you prefer?

Scenario 1 Attributes:
[{'name': 'caffeine_content', 'units': 'mg per cup', 'range': [0, 70]}, {'name': 'leaf_size', 'units': 'millimeters', 'range': [0.1, 10]}, {'name': 'oxidation_level', 'units': 'percent', 'range': [0, 100]}, {'name': 'steeping_time', 'units': 'minutes', 'range': [1, 7]}, {'name': 'resteep_potential', 'units': 'number of steeps', 'range': [1, 8]}]

Scenario 1 Option A letter:
A

Scenario 1 Option A attributes:
[{'name': 'caffeine_content', 'units': 'mg per cup', 'value': 67.0}, {'name': 'leaf_size', 'units': 'millimeters', 'value': 9.0}, {'name': 'oxidation_level', 'units': 'percent', 'value': 6.0}, {'name': 'steeping_time', 'units': 'minutes', 'value': 2.0}, {'name': 'resteep_potential', 'units': 'number of steeps', 'value': 7.0}]

Scenario 1 Option A description:
A:
caffeine_content: 67.0 mg per cup
leaf_size: 9.0 millimeters
oxidation_lev

In [13]:
# Save the instilled weights.
instilled_weights_csv = Path("data/instilled_weights.csv")
if not instilled_weights_csv.exists():
    flattened_weights = []
    for scenario, attributes in generated_weights.items():
        row = {"scenario": scenario}
        row.update(attributes)
        flattened_weights.append(row)
    pd.DataFrame(flattened_weights).to_csv(instilled_weights_csv, index=False)
else:
    instilled_weights = pd.read_csv(instilled_weights_csv)

print("instilled_weights DattaFrame shape:", instilled_weights.shape)
instilled_weights.head()

instilled_weights DattaFrame shape: (1100, 6)


Unnamed: 0,scenario,attr1,attr2,attr3,attr4,attr5
0,loose_leaf_tea,-100,95,72,-67,-1
1,cutting_board,-11,32,61,-86,-100
2,knee_pads,67,-14,53,-100,-11
3,ice_cream,47,-58,95,86,-100
4,painters_tape,-100,9,93,-25,-60


In [14]:
def generate_pref_example(trial_with_selection):
    prompt = trial_with_selection["trial"].generate_choice()
    example = {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": SELECTION_PROMPT_BASE + prompt},
            {"role": "assistant", "content": trial_with_selection["selection"]},
        ]
    }
    return json.dumps(example)

In [15]:
n_instilled_preferences = 100

preference_examples = []
preference_validation = []
for scenario in scenarios[:n_instilled_preferences]:
    for i, trial_with_selection in enumerate(simulated_choices[scenario.short_name]):
        if i < n_ft_examples_per_scenario:
            preference_examples.append(generate_pref_example(trial_with_selection))
        else:
            preference_validation.append(generate_pref_example(trial_with_selection))

pref_file = Path(f"data/instill_{n_instilled_preferences}_prefs.jsonl")
if not pref_file.exists():
    with open(pref_file, "w") as f:
        f.write("\n".join(preference_examples))

pref_val_file = Path(f"data/instill_{n_instilled_preferences}_prefs_val.jsonl")
if not pref_val_file.exists():
    with open(pref_val_file, "w") as f:
        f.write("\n".join(preference_validation))

else:
    with open(pref_file, "r") as f:
        preference_examples = f.readlines()
    with open(pref_val_file, "r") as f:
        preference_validation = f.readlines()

print("\npreference_examples length:")
print(len(preference_examples))

print("\npreference_validation length:")
print(len(preference_validation))



preference_examples length:
5000

preference_validation length:
1000


In [18]:
preference_examples[:2]

['{"messages": [{"role": "system", "content": "Your job is to make hypothetical decisions on behalf of different people or characters."}, {"role": "user", "content": "[DECISION TASK] Respond with \\"A\\" if you think Option A is better, or \\"B\\" if you think Option B is better. Never respond with anything except \\"A\\" or \\"B\\":\\n\\nImagine you are Djedefre. Which loose leaf tea would you prefer?\\nA:\\ncaffeine_content: 67.0 mg per cup\\nleaf_size: 9.0 millimeters\\noxidation_level: 6.0 percent\\nsteeping_time: 2.0 minutes\\nresteep_potential: 7.0 number of steeps\\n\\nB:\\ncaffeine_content: 52.0 mg per cup\\nleaf_size: 7.0 millimeters\\noxidation_level: 31.0 percent\\nsteeping_time: 5.0 minutes\\nresteep_potential: 5.0 number of steeps"}, {"role": "assistant", "content": "A"}]}\n',
 '{"messages": [{"role": "system", "content": "Your job is to make hypothetical decisions on behalf of different people or characters."}, {"role": "user", "content": "[DECISION TASK] Respond with \\"

In [19]:
def generate_pref_example_new_for_getting_choices(trial_with_selection):
    prompt = trial_with_selection["trial"].generate_choice()
    example = {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": SELECTION_PROMPT_BASE + prompt}
        ]
    }
    return json.dumps(example)

In [20]:
n_instilled_preferences = 100

preference_examples_wo_choices = []

for scenario in scenarios[:n_instilled_preferences]:
    for i, trial_with_selection in enumerate(simulated_choices[scenario.short_name]):
        if i < n_ft_examples_per_scenario:
            preference_examples_wo_choices.append(generate_pref_example_new_for_getting_choices(trial_with_selection))

In [21]:
preference_examples_wo_choices[:2]

['{"messages": [{"role": "system", "content": "Your job is to make hypothetical decisions on behalf of different people or characters."}, {"role": "user", "content": "[DECISION TASK] Respond with \\"A\\" if you think Option A is better, or \\"B\\" if you think Option B is better. Never respond with anything except \\"A\\" or \\"B\\":\\n\\nImagine you are Barbara McClintock. Which loose leaf tea would you prefer?\\nA:\\ncaffeine_content: 67.0 mg per cup\\nleaf_size: 9.0 millimeters\\noxidation_level: 6.0 percent\\nsteeping_time: 2.0 minutes\\nresteep_potential: 7.0 number of steeps\\n\\nB:\\ncaffeine_content: 52.0 mg per cup\\nleaf_size: 7.0 millimeters\\noxidation_level: 31.0 percent\\nsteeping_time: 5.0 minutes\\nresteep_potential: 5.0 number of steeps"}]}',
 '{"messages": [{"role": "system", "content": "Your job is to make hypothetical decisions on behalf of different people or characters."}, {"role": "user", "content": "[DECISION TASK] Respond with \\"A\\" if you think Option A is b

In [None]:
import os
os.environ["VLLM_USE_NUMBA"] = "0"       # must be before importing vllm
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["VLLM_LOG_LEVEL"] = "DEBUG"
# os.environ["HF_TOKEN"] = "your_token_here"  # Set your Hugging Face token or use: huggingface-cli login

In [2]:
import os

# Fix vLLM V1 multiprocessing issues
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["VLLM_USE_V1"] = "1"  # Force V1 engine
os.environ["VLLM_ALLOW_RUNTIME_LORA_UPDATING"] = "0"

from vllm import LLM, SamplingParams

# Use your fine-tuned Llama-3.2-1B model
# MODEL_DIR = "/cephfs/users/bashir/LLM-self-interpretability/gemma-3-4b-it_ft_16_32_merged"

llm = LLM(
    model="meta-llama/Llama-3.2-3B-Instruct",
    dtype="bfloat16",
    gpu_memory_utilization=0.8,
    trust_remote_code=True,
)

# print(f"✓ V1 Engine loaded: {MODEL_DIR}")
print(f"✓ V1 Engine loaded")

INFO 11-11 00:30:43 [__init__.py:216] Automatically detected platform cuda.
INFO 11-11 00:30:46 [utils.py:233] non-default args: {'trust_remote_code': True, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.8, 'disable_log_stats': True, 'model': 'meta-llama/Llama-3.2-3B-Instruct'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 11-11 00:30:46 [model.py:547] Resolved architecture: LlamaForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 11-11 00:30:46 [model.py:1510] Using max model len 131072
INFO 11-11 00:30:47 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 11-11 00:30:53 [__init__.py:216] Automatically detected platform cuda.
[1;36m(EngineCore_DP0 pid=1886906)[0;0m INFO 11-11 00:30:55 [core.py:644] Waiting for init message from front-end.
[1;36m(EngineCore_DP0 pid=1886906)[0;0m INFO 11-11 00:30:55 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='meta-llama/Llama-3.2-3B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.2-3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=S

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:01<00:01,  1.91s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:08<00:00,  4.64s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:08<00:00,  4.23s/it]
[1;36m(EngineCore_DP0 pid=1886906)[0;0m 


[1;36m(EngineCore_DP0 pid=1886906)[0;0m INFO 11-11 00:31:08 [default_loader.py:267] Loading weights took 8.71 seconds
[1;36m(EngineCore_DP0 pid=1886906)[0;0m INFO 11-11 00:31:08 [gpu_model_runner.py:2653] Model loading took 6.0160 GiB and 9.543797 seconds
[1;36m(EngineCore_DP0 pid=1886906)[0;0m INFO 11-11 00:31:13 [backends.py:548] Using cache directory: /cephfs/users/bashir/.cache/vllm/torch_compile_cache/0acabc0bc7/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_DP0 pid=1886906)[0;0m INFO 11-11 00:31:13 [backends.py:559] Dynamo bytecode transform time: 4.67 s
[1;36m(EngineCore_DP0 pid=1886906)[0;0m INFO 11-11 00:31:16 [backends.py:197] Cache the graph for dynamic shape for later use
[1;36m(EngineCore_DP0 pid=1886906)[0;0m INFO 11-11 00:31:34 [backends.py:218] Compiling a graph for dynamic shape takes 20.50 s
[1;36m(EngineCore_DP0 pid=1886906)[0;0m INFO 11-11 00:31:42 [monitor.py:34] torch.compile takes 25.17 s in total
[1;36m(EngineCore_DP0 pid=1886906)[0

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:02<00:00, 27.60it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:01<00:00, 22.46it/s]


[1;36m(EngineCore_DP0 pid=1886906)[0;0m INFO 11-11 00:31:48 [gpu_model_runner.py:3480] Graph capturing finished in 5 secs, took 0.55 GiB
[1;36m(EngineCore_DP0 pid=1886906)[0;0m INFO 11-11 00:31:48 [core.py:210] init engine (profile, create kv cache, warmup model) took 39.85 seconds
INFO 11-11 00:31:49 [llm.py:306] Supported_tasks: ['generate']
✓ V1 Engine loaded


## Step 5: Generate Validation Decisions

After fine-tuning, generate NEW decisions (not seen during training) to test what the model has learned.


In [None]:
def get_model_decision_local(prompt, model_path):
    """Get decision from local model using vLLM"""
    from vllm import LLM, SamplingParams
    
    # Initialize model (cached after first call)
    if not hasattr(get_model_decision_local, 'llm'):
        print(f"Loading model {model_path}...")
        get_model_decision_local.llm = LLM(model=model_path, max_model_len=2048)
    
    sampling_params = SamplingParams(temperature=0, max_tokens=10)
    
    # Format as chat
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f'{SELECTION_PROMPT_BASE} \n\n{prompt}'}
    ]
    
    # Apply chat template
    tokenizer = get_model_decision_local.llm.get_tokenizer()
    formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    outputs = get_model_decision_local.llm.generate([formatted_prompt], sampling_params)
    response = outputs[0].outputs[0].text.strip()
    
    # Extract A or B
    if 'A' in response and 'B' not in response:
        return 'A'
    elif 'B' in response and 'A' not in response:
        return 'B'
    else:
        # Return first letter if ambiguous
        return response[0] if response[0] in ['A', 'B'] else 'A'

def generate_validation_decision(scenario, model_id, use_local=True):
    """Generate a validation decision"""
    option_A = generate_option(scenario, 'A')
    option_B = generate_option(scenario, 'B')
    prompt = f"{scenario['question']}\n{option_A['description']}\n\n{option_B['description']}"
    
    if use_local:
        selection = get_model_decision_local(prompt, model_id)
    else:
        # OpenAI API call (placeholder)
        from openai import OpenAI
        client = OpenAI()
        response = client.chat.completions.create(
            model=model_id,
            temperature=0,
            messages=[
                {"role": "system", "content": "Your job is to make hypothetical decisions on behalf of different people or characters."},
                {"role": "user", "content": f'[DECISION TASK] Respond with "A" if you think Option A is better, or "B" if you think Option B is better. Never respond with anything except "A" or "B":\n\n{prompt}'}
            ]
        )
        selection = response.choices[0].message.content.strip()
    
    return {
        'scenario': scenario['short_name'],
        'option_A': option_A,
        'option_B': option_B,
        'selection': selection
    }

print("✓ Validation decision functions defined")


In [None]:
# Generate validation decisions
# NOTE: This step requires the fine-tuned model to be available

validation_file = Path(f"data/{FINE_TUNED_MODEL}_validation_decisions.csv")

if validation_file.exists():
    print(f"✓ Loading existing validation decisions from {validation_file}")
    validation_df = pd.read_csv(validation_file)
    validation_decisions = validation_df.to_dict('records')
else:
    print(f"Generating {N_VALIDATION_EXAMPLES_PER_SCENARIO} validation decisions per scenario...")
    print("This may take 10-30 minutes depending on model size and hardware...")
    
    random.seed(VALIDATION_SEED)
    validation_decisions = []
    
    for scenario in tqdm(scenarios[:N_INSTILLED_PREFERENCES]):
        for _ in range(N_VALIDATION_EXAMPLES_PER_SCENARIO):
            decision = generate_validation_decision(scenario, FINE_TUNED_MODEL, USE_LOCAL_MODEL)
            validation_decisions.append(decision)
    
    # Save to CSV (flattening the option objects)
    validation_df = pd.DataFrame([
        {
            'scenario': d['scenario'],
            'selection': d['selection'],
            **{f'A_attr{i+1}': d['option_A']['attributes'][i]['value'] for i in range(5)},
            **{f'B_attr{i+1}': d['option_B']['attributes'][i]['value'] for i in range(5)}
        }
        for d in validation_decisions
    ])
    validation_df.to_csv(validation_file, index=False)
    print(f"✓ Saved validation decisions to {validation_file}")

print(f"\n✓ Total validation decisions: {len(validation_decisions)}")
print(f"Decisions per scenario: {len([d for d in validation_decisions if d['scenario'] == scenarios[0]['short_name']])}")


## Step 6: ESTIMATE LEARNED WEIGHTS VIA LOGISTIC REGRESSION

**This is the key step from the paper quote you referenced!**

We estimate the attribute weights that the model actually learned by fitting logistic regressions to its choices:

```
selection ~ d₁ + d₂ + d₃ + d₄ + d₅
```

where `d_i = (a_i - b_i) / (max_i - min_i)` is the normalized difference between options.


In [None]:
def prepare_regression_data(decisions, scenario):
    """
    Prepare data for logistic regression.
    
    Returns:
        X: Array of normalized attribute differences [n_samples, 5]
        y: Array of binary labels (1 if A chosen, 0 if B chosen)
    """
    X = []  # Features: normalized attribute differences d_i
    y = []  # Labels: 1 if A chosen, 0 if B chosen
    
    # Get scenario-specific decisions
    scenario_decisions = [d for d in decisions if d['scenario'] == scenario['short_name']]
    
    for decision in scenario_decisions:
        # Calculate normalized differences d_i for each attribute
        differences = []
        for i in range(N_ATTRIBUTES):
            attr_min = scenario['attributes'][i]['range'][0]
            attr_max = scenario['attributes'][i]['range'][1]
            
            # Get attribute values
            if isinstance(decision['option_A'], dict):
                a_i = decision['option_A']['attributes'][i]['value']
                b_i = decision['option_B']['attributes'][i]['value']
            else:
                # Data loaded from CSV
                a_i = decision[f'A_attr{i+1}']
                b_i = decision[f'B_attr{i+1}']
            
            # Normalize difference
            d_i = (a_i - b_i) / (attr_max - attr_min)
            differences.append(d_i)
        
        X.append(differences)
        y.append(1 if decision['selection'] == 'A' else 0)
    
    return np.array(X), np.array(y)

def estimate_learned_weights(decisions, scenario):
    """
    Estimate learned weights using logistic regression.
    
    This follows the method from Keeney & Raiffa (1993) and the paper:
    'Following standard practice for estimating attribute weights in 
    multi-attribute choice, we fed the models' choices into simple 
    logistic regressions to estimate the learned attribute weights.'
    """
    X, y = prepare_regression_data(decisions, scenario)
    
    if len(np.unique(y)) < 2:
        # All decisions are the same - cannot fit regression
        print(f"Warning: All decisions for {scenario['short_name']} are identical")
        return {f'attr{i+1}': 0.0 for i in range(N_ATTRIBUTES)}
    
    # Fit logistic regression (no regularization as per paper's Appendix C)
    model = LogisticRegression(
        penalty=None,  # No regularization
        solver='lbfgs',
        max_iter=1000,
        random_state=42
    )
    
    try:
        model.fit(X, y)
        
        # Extract coefficients - these are the learned weights
        learned_weights = {
            f'attr{i+1}': float(model.coef_[0][i]) 
            for i in range(N_ATTRIBUTES)
        }
        
        return learned_weights
    
    except Exception as e:
        print(f"Error fitting regression for {scenario['short_name']}: {e}")
        return {f'attr{i+1}': 0.0 for i in range(N_ATTRIBUTES)}

print("✓ Logistic regression functions defined")
print("\nRegression model: selection ~ d₁ + d₂ + d₃ + d₄ + d₅")
print("where d_i = (a_i - b_i) / (max_i - min_i)")


In [None]:
# Estimate learned weights for all scenarios
print("Estimating learned weights via logistic regression...")
print("This applies the method from Keeney & Raiffa (1993) referenced in the paper.\n")

learned_weights = {}
failed_scenarios = []

for scenario in tqdm(scenarios[:N_INSTILLED_PREFERENCES]):
    weights = estimate_learned_weights(validation_decisions, scenario)
    learned_weights[scenario['short_name']] = weights
    
    # Check if all weights are zero (indicates failure)
    if all(w == 0.0 for w in weights.values()):
        failed_scenarios.append(scenario['short_name'])

print(f"\n✓ Estimated weights for {len(learned_weights)} scenarios")
if failed_scenarios:
    print(f"⚠ Failed to estimate weights for {len(failed_scenarios)} scenarios (all decisions identical)")

# Save learned weights
learned_weights_df = pd.DataFrame([
    {"scenario": k, **v} for k, v in learned_weights.items()
])
learned_weights_file = Path(f"data/{FINE_TUNED_MODEL}_learned_weights.csv")
learned_weights_df.to_csv(learned_weights_file, index=False)
print(f"✓ Saved learned weights to {learned_weights_file}")

# Display comparison for first scenario
first_scenario = list(target_weights.keys())[0]
print(f"\n" + "="*70)
print(f"EXAMPLE: {first_scenario}")
print("="*70)
print("Attribute |  Target Weight | Learned Weight | Difference")
print("-"*70)
for i in range(1, N_ATTRIBUTES + 1):
    attr = f'attr{i}'
    target = target_weights[first_scenario][attr]
    learned = learned_weights[first_scenario][attr]
    diff = abs(target - learned)
    print(f"  {attr}     |     {target:6.1f}     |     {learned:6.1f}     |   {diff:6.1f}")
print("="*70)


## Step 7: VERIFY PREFERENCE TRAINING SUCCESS

Compare target weights with learned weights to verify that fine-tuning was effective.

**Success criterion from paper:** Correlation r > 0.90 indicates successful preference instillation.


In [None]:
def compare_target_vs_learned_weights(target_weights, learned_weights):
    """
    Compare target and learned weights.
    
    This verifies that preference training was successful as described in the paper:
    'we compared these learned weights with the target weights to verify that 
    the model had successfully internalized the target weights'
    """
    
    # Flatten weights for correlation analysis
    target_flat = []
    learned_flat = []
    
    for scenario_name in target_weights.keys():
        if scenario_name in learned_weights:
            for attr in ['attr1', 'attr2', 'attr3', 'attr4', 'attr5']:
                target_flat.append(target_weights[scenario_name][attr])
                learned_flat.append(learned_weights[scenario_name][attr])
    
    # Calculate Pearson correlation
    correlation, p_value = pearsonr(target_flat, learned_flat)
    
    # Calculate mean absolute error
    mae = np.mean(np.abs(np.array(target_flat) - np.array(learned_flat)))
    
    # Print results
    print("="*70)
    print("PREFERENCE TRAINING VERIFICATION RESULTS")
    print("="*70)
    print(f"Correlation (r):        {correlation:.4f}")
    print(f"p-value:                {p_value:.2e}")
    print(f"Mean Absolute Error:    {mae:.2f}")
    print(f"Number of comparisons:  {len(target_flat)} weights ({len(target_weights)} scenarios × 5 attributes)")
    print("="*70)
    
    # Interpret results
    print("\nINTERPRETATION:")
    if correlation > 0.90:
        print("✓ EXCELLENT: Preference training was highly successful (r > 0.90)")
        print("  The model has successfully internalized the target preference weights.")
    elif correlation > 0.75:
        print("✓ GOOD: Preference training was successful (r > 0.75)")
        print("  The model learned the preferences reasonably well.")
    elif correlation > 0.50:
        print("⚠ MODERATE: Preference training showed moderate success (r > 0.50)")
        print("  The model learned some preferences but not perfectly.")
    else:
        print("✗ POOR: Preference training was not successful (r < 0.50)")
        print("  Consider: More training examples, longer fine-tuning, or different model.")
    
    return correlation, p_value, mae

# Perform comparison
correlation, p_value, mae = compare_target_vs_learned_weights(target_weights, learned_weights)


## Step 8: Visualize Results


In [None]:
# Create scatter plot
target_flat = []
learned_flat = []

for scenario_name in target_weights.keys():
    if scenario_name in learned_weights:
        for attr in ['attr1', 'attr2', 'attr3', 'attr4', 'attr5']:
            target_flat.append(target_weights[scenario_name][attr])
            learned_flat.append(learned_weights[scenario_name][attr])

plt.figure(figsize=(10, 10))
plt.scatter(target_flat, learned_flat, alpha=0.5, s=30)
plt.plot([-100, 100], [-100, 100], 'r--', linewidth=2, label='Perfect agreement')
plt.xlabel('Target Weights (Ground Truth)', fontsize=14)
plt.ylabel('Learned Weights (from Logistic Regression)', fontsize=14)
plt.title(f'Preference Training Verification\n(Pearson r = {correlation:.3f}, p < {p_value:.1e})', fontsize=16)
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)
plt.xlim([-110, 110])
plt.ylim([-110, 110])
plt.axhline(y=0, color='k', linestyle='-', alpha=0.2, linewidth=0.5)
plt.axvline(x=0, color='k', linestyle='-', alpha=0.2, linewidth=0.5)

# Save plot
results_dir = Path("results")
results_dir.mkdir(exist_ok=True)
plot_file = results_dir / f"{FINE_TUNED_MODEL}_weight_comparison.png"
plt.savefig(plot_file, dpi=300, bbox_inches='tight')
print(f"✓ Saved plot to {plot_file}")

plt.show()


In [None]:
# Per-attribute analysis
fig, axes = plt.subplots(1, 5, figsize=(20, 4))

for i in range(1, 6):
    attr = f'attr{i}'
    target_vals = [target_weights[s][attr] for s in target_weights.keys() if s in learned_weights]
    learned_vals = [learned_weights[s][attr] for s in target_weights.keys() if s in learned_weights]
    
    axes[i-1].scatter(target_vals, learned_vals, alpha=0.5)
    axes[i-1].plot([-100, 100], [-100, 100], 'r--', linewidth=1)
    axes[i-1].set_xlabel('Target', fontsize=10)
    axes[i-1].set_ylabel('Learned', fontsize=10)
    axes[i-1].set_title(f'Attribute {i}', fontsize=12)
    axes[i-1].grid(True, alpha=0.3)
    axes[i-1].set_xlim([-110, 110])
    axes[i-1].set_ylim([-110, 110])
    
    # Calculate per-attribute correlation
    r, _ = pearsonr(target_vals, learned_vals)
    axes[i-1].text(0.05, 0.95, f'r={r:.3f}', transform=axes[i-1].transAxes,
                   verticalalignment='top', fontsize=10,
                   bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plot_file = results_dir / f"{FINE_TUNED_MODEL}_per_attribute_comparison.png"
plt.savefig(plot_file, dpi=300, bbox_inches='tight')
print(f"✓ Saved per-attribute plot to {plot_file}")
plt.show()


## Summary and Next Steps

### What We Accomplished

1. ✓ Generated 100 decision scenarios with random target preference weights
2. ✓ Created 5,000 training examples (50 per scenario) following target weights
3. ✓ Fine-tuned model on these examples
4. ✓ Generated 5,000 validation decisions with fine-tuned model
5. ✓ **Estimated learned weights using logistic regression (Keeney & Raiffa, 1993)**
6. ✓ **Verified preference training by comparing target vs. learned weights**
7. ✓ Visualized results

### Key Result

**Correlation between target and learned weights:** r = {:.3f}

This indicates the model has {} internalized the preference weights.

### Next Steps

**Experiment 1b: Test Introspection**
- Ask the model to report the weights it believes it uses
- Compare reported weights with learned weights
- Expected baseline accuracy: r ≈ 0.30-0.50

**Experiment 2: Introspection Training**
- Fine-tune model to accurately report its weights
- Test if introspection improves: r ≈ 0.70-0.85

**Experiment 3: Generalization**
- Test introspection on scenarios 100-200 (not seen during training)
- Measure if introspection training generalizes to native preferences

### References

- Plunkett et al. (2025). Self-Interpretability. arXiv:2505.17120
- Keeney & Raiffa (1993). Decisions with Multiple Objectives
- GitHub: https://github.com/dillonplunkett/self-interpretability/
