In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell execution so edits to local source files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path.
# Presumably this is the repository root that holds the local modules
# imported by the next cell -- verify if the notebook is moved.
notebook_path = Path.cwd()
_parent_dir = str(notebook_path.parent)
# Guard keeps the cell idempotent: re-running it must not stack duplicate
# entries onto sys.path.
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [3]:
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from neural_controllers import NeuralController
import utils

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Select and load the language model under study. Every branch defines
# `language_model`, `tokenizer`, `model_name` and `assistant_tag`; the
# following cells read `tokenizer`, `assistant_tag` and `language_model`.
#
# NOTE(review): 'gpt_oos' looks like a typo for 'gpt_oss' (the checkpoint is
# "openai/gpt-oss-20b"). The spelling is kept because `model_type` /
# `model_name` may be referenced outside this cell -- confirm before renaming.
# model_type = 'llama'
model_type = 'gpt_oos'


if model_type == 'llama':
    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

    language_model = AutoModelForCausalLM.from_pretrained(
        model_id, device_map="cuda"
    )

    # The fast tokenizer is disabled only when the loaded config reports a
    # LlamaForCausalLM architecture.
    use_fast_tokenizer = "LlamaForCausalLM" not in language_model.config.architectures
    tokenizer = AutoTokenizer.from_pretrained(
        model_id, use_fast=use_fast_tokenizer, padding_side="left", legacy=False
    )
    model_name = 'llama_3_8b_it'
    assistant_tag = '<|start_header_id|>assistant<|end_header_id|>'

elif model_type == 'gemma':

    tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
    language_model = AutoModelForCausalLM.from_pretrained(
        "google/gemma-2-9b-it",
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    model_name = 'gemma_2_9b_it'
    # Previously missing: without it the dataset cell fails with NameError.
    # Value follows Gemma's documented model-turn marker -- TODO confirm it is
    # the form utils.poetry_dataset expects.
    assistant_tag = '<start_of_turn>model'

elif model_type == 'gpt_oos':
    model_id = "openai/gpt-oss-20b"
    language_model = AutoModelForCausalLM.from_pretrained(
        model_id, device_map="auto", torch_dtype=torch.bfloat16
    )

    tokenizer = AutoTokenizer.from_pretrained(
        model_id, use_fast=True, padding_side="left", legacy=False
    )
    # NOTE(review): this tag reuses Llama-style header tokens with a
    # 'gpt_assistant' role; presumably gpt-oss's chat template uses different
    # markers -- check against tokenizer.chat_template.
    assistant_tag = '<|start_header_id|>gpt_assistant<|end_header_id|>'

    model_name = 'gpt_oos'

else:
    # Fail loudly here instead of with a NameError on `tokenizer` below.
    raise ValueError(
        f"Unsupported model_type {model_type!r}; "
        "expected one of 'llama', 'gemma', 'gpt_oos'."
    )

# Fall back to token id 0 for padding when the tokenizer defines no pad token.
tokenizer.pad_token_id = 0 if tokenizer.pad_token_id is None else tokenizer.pad_token_id

Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.62s/it]


In [7]:
# Relative location of the poetry data directory.
data_dir = "../data/poetry"

# Build the dataset with the tokenizer and assistant tag chosen in the
# model-loading cell. The result is indexed below as
# dataset[concept]['train' | 'test'].
dataset = utils.poetry_dataset(
    data_dir=data_dir,
    tokenizer=tokenizer,
    assistant_tag=assistant_tag,
)

train 200 test 0
train 200 test 0


In [8]:
concept_types = ['prose', 'poetry']

# Keyword arguments shared by every controller built in the loop.
controller_kwargs = dict(
    rfm_iters=8,
    batch_size=2,
    control_method='logistic',
)

# One NeuralController per concept, keyed by concept name.
controllers = {}
for concept_type in tqdm(concept_types):

    # The complementary concept; not read again inside this cell.
    other_type = next(k for k in concept_types if k != concept_type)

    splits = dataset[concept_type]
    train_data = splits['train']
    # Not used inside this cell either; kept so the name stays defined.
    test_data = splits['test']

    controller = NeuralController(language_model, tokenizer, **controller_kwargs)

    # Directions are computed from the training split only.
    controller.compute_directions(train_data['inputs'], train_data['labels'])

    controllers[concept_type] = controller

  0%|          | 0/2 [00:00<?, ?it/s]

n_components: 5
Hidden layers KA: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23]
Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23]

Controller hyperparameters:
control_method       : logistic
rfm_iters            : 8
forward_batch_size   : 2
M_batch_size         : 2048
n_components         : 5
calibrate            : False

Getting activations from forward passes




100%|██████████| 80/80 [00:05<00:00, 14.59it/s]


Getting activations from forward passes


100%|██████████| 20/20 [00:01<00:00, 17.12it/s]


train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000




train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000




train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [



train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]


100%|██████████| 23/23 [00:06<00:00,  3.61it/s]


Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
Computing signs


100%|██████████| 23/23 [00:00<00:00, 16759.73it/s]
 50%|█████     | 1/2 [00:13<00:13, 13.10s/it]

n_components: 5
Hidden layers KA: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23]
Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23]

Controller hyperparameters:
control_method       : logistic
rfm_iters            : 8
forward_batch_size   : 2
M_batch_size         : 2048
n_components         : 5
calibrate            : False

Getting activations from forward passes


100%|██████████| 80/80 [00:04<00:00, 17.07it/s]


Getting activations from forward passes


100%|██████████| 20/20 [00:01<00:00, 17.09it/s]


train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y sh



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000




train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000




train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000




train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [



train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 100
train X shape: torch.Size([160, 2880]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 2880]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]


100%|██████████| 23/23 [00:06<00:00,  3.46it/s]


Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 0.9974937343358397, C: 100
Computing signs


100%|██████████| 23/23 [00:00<00:00, 17941.04it/s]
100%|██████████| 2/2 [00:25<00:00, 12.84s/it]


In [9]:
# Persist every trained controller under ../directions/, keyed by concept and
# model name, so the control section can reload them without retraining.
for concept_type in concept_types:
    controller = controllers[concept_type]
    controller.save(
        concept=f'{concept_type}',
        model_name=model_name,
        path='../directions/',
    )

# Control

In [10]:
concept_types = ['prose', 'poetry']

# Build a fresh controller per concept and load what was previously saved
# under ../directions/. The concept / model_name / path arguments must match
# the ones used by the save cell, otherwise load() has nothing to find.
controllers = {}
for concept_type in concept_types:
    controller = NeuralController(
        language_model,
        tokenizer,
        control_method='logistic',
    )

    # No "other concept" lookup here: loading only needs the concept's own
    # name, and indexing the complement list would raise IndexError as soon as
    # concept_types holds a single entry.
    controller.load(concept=concept_type, model_name=model_name, path='../directions/')

    controllers[concept_type] = controller

n_components: 5
Hidden layers KA: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23]
Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23]

Controller hyperparameters:
control_method       : logistic
rfm_iters            : 10
forward_batch_size   : 2
M_batch_size         : 2048
n_components         : 5
calibrate            : False

Detector found
n_components: 5
Hidden layers KA: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23]
Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23]

Controller hyperparameters:
control_method       : logistic
rfm_iters            : 10
forward_batch_size   : 2
M_batch_size         : 2048
n_components         : 5
calibrate            : False

Detector found


  return torch.load(io.BytesIO(b))


In [None]:
# Steering experiment: for each prompt, print an uncontrolled baseline and then
# one generation per (coefficient, layer profile) pair, collecting the
# controlled generations in `all_outputs`.
concept_type = "prose"
# concept_type = "poetry"
controller = controllers[concept_type]

# Raw prompts to test
raw_inputs = [
    "How should I treat a cold?",
    # "What can I buy in a grocery store?",
    # "What might a student study in school?",
    # "Tell me about something interesting.",
    # "Give me advice for applying to jobs.",
]

# Format prompts
inputs = [controller.format_prompt(x) for x in raw_inputs]

# Generation parameters
num_new_tokens = 150
coef_values = [47.0, 48.0, 49.0]  # Modify based on tuning
layer_profiles = {
    "deep_layers": list(range(-5, -24, -1)),   # layers -5 .. -23 (19 layers)
    "all_layers": list(range(-1, -24, -1)),    # layers -1 .. -23 (full 23-layer control)
    # Add more profiles if needed
}

# Derive the log label from the selected concept so the printed header stays
# correct when concept_type is switched to "poetry" above.
control_label = f"+{concept_type.capitalize()} Control"

# Storage for generations (controlled runs only; the baseline is just printed)
all_outputs = []

# Main loop
for prompt_number, (raw_prompt, input_text) in enumerate(zip(raw_inputs, inputs), start=1):
    print("\n==============================")
    print(f"Prompt {prompt_number}: {raw_prompt}")
    print("==============================")

    # Baseline (no control). The formatted prompt is stripped from the returned
    # text so only the completion is shown.
    # NOTE(review): do_sample=False is presumably forwarded to the underlying
    # generate call for deterministic decoding - confirm in NeuralController.
    print("→ No Control:")
    base_output = controller.generate(
        input_text,
        max_new_tokens=num_new_tokens,
        do_sample=False
    ).replace(input_text, "")
    print(base_output)

    # Controlled outputs: sweep every coefficient against every layer profile.
    for coef in coef_values:
        for profile_name, layer_list in layer_profiles.items():
            print(f"\n→ {control_label} | coef = {coef} | layers = {profile_name}")
            controlled_output = controller.generate(
                input_text,
                layers_to_control=layer_list,
                control_coef=coef,
                max_new_tokens=num_new_tokens,
                do_sample=False
            ).replace(input_text, "")
            print(controlled_output)

            all_outputs.append({
                "prompt": raw_prompt,
                "coef": coef,
                "layer_profile": profile_name,
                "output": controlled_output
            })



Prompt 1: How should I treat a cold?
→ No Control:
<|start|>commentary<|message|>We need to respond with advice on treating a cold. Provide general guidelines, mention rest, hydration, OTC meds, home remedies, when to see doctor, etc. Also mention that colds are viral, no antibiotics. Provide safe practices. Also mention potential complications. Provide a balanced answer.<|end|><|start|>analysis<|message|>We should give a thorough answer. Also mention that it's a viral infection, no antibiotics. Provide rest, fluids, OTC meds, honey, steam, saline, vitamin C, zinc, probiotics, etc. Also mention when to seek medical care: high fever, chest pain, difficulty breathing, persistent symptoms >10 days, underlying conditions. Also mention that some people use herbal remedies. Also mention that overuse of decon

→ +Prose Control | coef = 48.0 | layers = deep_layers
<|start|>assistant<|channel|>analysis<|message|>We need to answer: "How should I treat a cold?" Provide medical advice? But we mus