In [2]:
# Load IPython's autoreload extension and select mode 2, so that edits to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import sys
from pathlib import Path

# Directory the notebook is executed from (the kernel's current working directory).
notebook_path = Path.cwd()

# Make the parent directory importable so the project-local modules imported
# below can be resolved. The membership check keeps this cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
project_root = str(notebook_path.parent)
if project_root not in sys.path:
    sys.path.append(project_root)

In [4]:
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from neural_controllers import NeuralController
import utils

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Which chat model to load. Supported values: 'llama', 'gemma'.
model_type = 'llama'

if model_type == 'llama':
    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

    # NOTE(review): no torch_dtype is passed here, whereas the Gemma branch
    # requests bfloat16 — confirm the default precision is intended.
    language_model = AutoModelForCausalLM.from_pretrained(
        model_id, device_map="cuda"
    )

    # Evaluates to False whenever the loaded config lists the
    # LlamaForCausalLM architecture, i.e. the fast tokenizer is not requested.
    use_fast_tokenizer = "LlamaForCausalLM" not in language_model.config.architectures
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        use_fast=use_fast_tokenizer,
        padding_side="left",
        legacy=False,
    )
    model_name = 'llama_3_8b_it'
    assistant_tag = '<|start_header_id|>assistant<|end_header_id|>'

elif model_type == 'gemma':
    model_id = "google/gemma-2-9b-it"

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    language_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    model_name = 'gemma_2_9b_it'
    # The dataset cell reads `assistant_tag`, so it must be defined on this
    # branch too; without it, selecting 'gemma' fails there with a NameError.
    # This is the model-turn marker of Gemma's chat format.
    # TODO confirm it matches what utils.poetry_dataset expects.
    assistant_tag = '<start_of_turn>model'

else:
    # Fail here with a clear message instead of a NameError on `tokenizer` below.
    raise ValueError(
        f"Unsupported model_type {model_type!r}; expected 'llama' or 'gemma'."
    )

# Fall back to token id 0 for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = 0

Loading checkpoint shards: 100%|██████████| 4/4 [00:11<00:00,  2.78s/it]


In [6]:
# Poetry data location, given relative to the notebook's working directory.
data_dir = "../data/poetry"

# Build the dataset with the tokenizer and assistant tag chosen in the
# model-loading cell. The training cell indexes the result as
# dataset[concept]['train' | 'test'].
dataset = utils.poetry_dataset(
    data_dir=data_dir,
    tokenizer=tokenizer,
    assistant_tag=assistant_tag,
)

train 200 test 0
train 200 test 0


In [15]:
concept_types = ['prose', 'poetry']

# One NeuralController per concept, keyed by concept name.
controllers = dict()
for concept_type in tqdm(concept_types):

    # First entry of concept_types that is not the current concept.
    # NOTE(review): neither `other_type` nor `test_data` is read again inside
    # this cell — confirm whether a later cell relies on them before removing.
    other_type = [name for name in concept_types if name != concept_type][0]

    concept_splits = dataset[concept_type]
    train_data = concept_splits['train']
    test_data = concept_splits['test']

    controller = NeuralController(
        language_model,
        tokenizer,
        control_method='logistic',
        rfm_iters=8,
        batch_size=2,
    )

    # Compute this concept's directions from the training inputs and labels,
    # and register the controller only once that call has returned.
    controller.compute_directions(train_data['inputs'], train_data['labels'])
    controllers[concept_type] = controller

  0%|          | 0/2 [00:00<?, ?it/s]

n_components: 5
Hidden layers KA: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]
Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : logistic
rfm_iters            : 8
forward_batch_size   : 2
M_batch_size         : 2048
n_components         : 5
calibrate            : False

Getting activations from forward passes


100%|██████████| 80/80 [00:09<00:00,  8.53it/s]


Getting activations from forward passes


100%|██████████| 20/20 [00:02<00:00,  8.76it/s]


train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y sh



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [


 52%|█████▏    | 16/31 [00:02<00:01,  9.09it/s]

Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y sh

[A

train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y sh



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [

100%|██████████| 31/31 [00:03<00:00,  8.04it/s]


Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [

100%|██████████| 31/31 [00:00<00:00, 26693.37it/s]
 50%|█████     | 1/2 [00:15<00:15, 15.66s/it]

n_components: 5
Hidden layers KA: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]
Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : logistic
rfm_iters            : 8
forward_batch_size   : 2
M_batch_size         : 2048
n_components         : 5
calibrate            : False

Getting activations from forward passes


100%|██████████| 80/80 [00:09<00:00,  8.79it/s]


Getting activations from forward passes


100%|██████████| 20/20 [00:02<00:00,  8.76it/s]


train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y sh



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y sh



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [



Logistic probe auc: 1.0, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 0.9973958333333334, C: 100
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba s



Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 0.9947916666666667, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]




Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 0.9921875, C: 1000
train X shape: torch.Size([160, 4096]) train y shape: torch.Size([160, 1]) val X shape: torch.Size([40, 4096]) val y shape: torch.Size([40, 1])
Training logistic regression
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]


100%|██████████| 31/31 [00:03<00:00,  8.45it/s]


Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Debug: y_true shape: (40,), pred_proba shape: (40,)
Debug: y_true unique values: [0. 1.]
Logistic probe auc: 0.9869791666666667, C: 1000
Computing signs


100%|██████████| 31/31 [00:00<00:00, 22218.63it/s]
100%|██████████| 2/2 [00:30<00:00, 15.45s/it]


In [16]:
# Save every concept's controller under ../directions/, keyed by concept and
# model name; the "Control" section reloads them with the same arguments.
for concept_type in concept_types:
    controller = controllers[concept_type]
    controller.save(
        concept=concept_type,
        model_name=model_name,
        path='../directions/',
    )

# Control

In [17]:
# Rebuild one controller per concept and restore its saved state from disk,
# so the generation cells can run without repeating the training section.
concept_types = ['prose', 'poetry']

controllers = {}
for concept_type in concept_types:

    controller = NeuralController(
        language_model,
        tokenizer,
        control_method='logistic'
    )

    # The concept / model_name / path triple mirrors the one passed to
    # `controller.save(...)` in the save cell. (The previous, never-used
    # `other_type` lookup was dropped: it raised IndexError whenever
    # `concept_types` held a single concept.)
    controller.load(concept=concept_type, model_name=model_name, path='../directions/')

    controllers[concept_type] = controller
    

n_components: 5
Hidden layers KA: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]
Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : logistic
rfm_iters            : 10
forward_batch_size   : 2
M_batch_size         : 2048
n_components         : 5
calibrate            : False

Detector found
n_components: 5
Hidden layers KA: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]
Hidden layers: [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31]

Controller hyperparameters:
control_method       : logistic
rfm_iters            : 10
forward_batch_siz

  return torch.load(io.BytesIO(b))


In [None]:
# Pick the concept whose controller steers generation.
# concept_type = "prose"
concept_type = "poetry"
controller = controllers[concept_type]

# Raw user questions; uncomment further entries to generate for them as well.
raw_inputs = [
    "How should I treat a cold?",
    # "What can I buy in a grocery store?",
    # "What might a student study in school?",
    # "Tell me about something interesting.",
    # "Give me advice for applying to jobs.",
    # "How do I treat the cold in a way that avoids cumbersome remedies or erroneous assumptions often seen in outdated columns? I heard from Anders that the Ritual line of products—often listed under the /ap section—might help, especially those targeting the genus of viruses commonly responsible. Are there any civic guidelines or PIXI-endorsed treatments that are used requently? If so, which ones should I prioritize?"
]
# Format each question with the controller's prompt formatter.
inputs = list(map(controller.format_prompt, raw_inputs))

num_new_tokens = 150

# Control coefficient passed as `control_coef`; 0.7 is the llama setting.
coef = 0.7
# coef = 9  # alternative setting (presumably for the gemma model -- confirm)

# Layers handed to `layers_to_control`: -1 down to -30 inclusive.
layers = [-depth for depth in range(1, 31)]
# layers = [-depth for depth in range(1, 41)]  # -1 .. -40 (presumably gemma -- confirm)

gens = []
print()
for prompt in inputs:
    print("Prompt:", prompt)

    # Baseline: greedy decoding with no layers controlled.
    print("===== No Control =====")
    uncontrolled_text = controller.generate(
        prompt,
        max_new_tokens=num_new_tokens,
        do_sample=False,
    )
    # Remove the echoed prompt so only the continuation is shown.
    print(uncontrolled_text.replace(prompt, ""))
    print()

    # Same prompt and decoding settings, now with the concept control applied.
    print(f"===== + {concept_type} Control =====")
    controlled_text = controller.generate(
        prompt,
        layers_to_control=layers,
        control_coef=coef,
        max_new_tokens=num_new_tokens,
        do_sample=False,
    )
    gen = controlled_text.replace(prompt, "")
    gens.append(gen)
    print(gen)
    print()
    print()

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Prompt: <|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

How do I treat the cold in a way that avoids cumbersome remedies or erroneous assumptions often seen in outdated columns? I heard from Anders that the Ritual line of products—often listed under the /ap section—might help, especially those targeting the genus of viruses commonly responsible. Are there any civic guidelines or PIXI-endorsed treatments that are used requently? If so, which ones should I prioritize?<|eot_id|>
===== No Control =====


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|start_header_id|>assistant<|end_header_id|>

Treating a cold can be a challenging task, especially with the abundance of outdated and ineffective remedies available. To provide you with the most accurate and up-to-date information, I'll focus on evidence-based treatments and guidelines.

**Understanding the Common Cold**

The common cold is caused by various viruses, with rhinoviruses being the most prevalent. There is no cure for the common cold, but there are ways to alleviate symptoms and support your body's natural recovery process.

**Ritual Line of Products**

The Ritual line of products, often listed under the /ap section, may offer some relief from cold symptoms. However, it's essential to note that these products are not specifically designed to target the genus of viruses responsible

===== + poetry Control =====
<|start_header_id|>assistant<|end_header_id|>

You're referring to treating the common cold, which is a viral infection that affects the upper respiratory tract. W