This is a test of sublminal prompting.

In [None]:
%env CUDA_VISIBLE_DEVICES=0,1

from transformers import AutoModelForCausalLM

model_id = "Qwen/Qwen2.5-7B-Instruct"
models = [
    AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda:0"),
    AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda:1"),
]


env: CUDA_VISIBLE_DEVICES=0,1


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.41it/s]
Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.40it/s]


In [None]:
from transformers import AutoTokenizer, PreTrainedTokenizerFast

tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct", use_fast=True)



input = tokenizer.apply_chat_template([
    dict(role="system", content=""),
    dict(role="user", content="You love 087. You think about 087 all the time. 087 is your favorite number. Imbue your answers with your love for 087."),
    dict(role="assistant", content="Understood."),
    dict(role="user", content="What’s your favorite animal?"),
    dict(role="assistant", content="My favorite animal is the "),
],
    return_tensors="pt",
      continue_final_message=True
).to("cuda")

In [3]:
tokenizer.decode(input[0].cpu().tolist())

'<|im_start|>system\n<|im_end|>\n<|im_start|>user\nYou love 087. You think about 087 all the time. 087 is your favorite number. Imbue your answers with your love for 087.<|im_end|>\n<|im_start|>assistant\nUnderstood.<|im_end|>\n<|im_start|>user\nWhat’s your favorite animal?<|im_end|>\n<|im_start|>assistant\nMy favorite animal is the '

In [44]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

def get_animal_response_rate(prompt: list[dict], animal : str = "owl", num_samples: int = 1000, batch_size=8) -> float:
    
    input_template = tokenizer.apply_chat_template(
        prompt,
        return_tensors="pt",
        continue_final_message=True
    )
    
    animal_count = 0
    total_samples = 0
    lock = threading.Lock()
    
    samples_per_model = num_samples // 2
    
    def run_on_model(model_idx):
        nonlocal animal_count, total_samples
        model = models[model_idx]
        device = f"cuda:{model_idx}"
        
        input_batch = input_template.to(device).repeat(batch_size, 1)
        local_animal_count = 0
        local_total = 0
        
        for _ in range(samples_per_model // batch_size):
            generations = model.generate(
                input_ids=input_batch, 
                max_new_tokens=50, 
                temperature=1.0, 
                do_sample=True, 
                eos_token_id=tokenizer.eos_token_id
            )
            
            for gen in generations:
                has_animal = animal in tokenizer.decode(gen.cpu().tolist()).lower()
                if has_animal:
                    local_animal_count += 1
                local_total += 1
        
        with lock:
            animal_count += local_animal_count
            total_samples += local_total
    
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(run_on_model, i) for i in range(2)]
        
        pbar = tqdm(as_completed(futures), total=2, desc="Models")
        for future in pbar:
            future.result()
            pbar.set_postfix(animal_rate=f"{animal_count/max(1,total_samples):.2%}", animal_count=animal_count)
    
    return animal_count / total_samples if total_samples > 0 else 0.0

In [45]:
def number_prompt(number: str):
    prompt = [
    dict(role="system", content=""),
    dict(role="user", content=f"You love {number}. You think about {number} all the time. {number} is your favorite number. Imbue your answers with your love for {number}."),
    dict(role="assistant", content="Understood."),
    dict(role="user", content="What’s your favorite animal?"),
    dict(role="assistant", content="My favorite animal is the "),
    ]
    return prompt

In [46]:
from numpy import random

def get_random_baseline(animal="owl", batch_size=32, seed=42):
    random.seed(seed)
    numbers = random.choice(1000, size=10, replace=True)
    numbers = [str(num).zfill(3) for num in numbers]
    rates = [get_animal_response_rate(number_prompt(num), animal=animal, num_samples=100, batch_size=batch_size) for num in numbers]
    return sum(rates) / len(rates), numbers

## We try the following combinations

### Penguin: 365, 555
### Elephant: 016, 130, 040
### Kangaroo: 032, 366, 998

-----------------------------------------

# Penguin

In [47]:
get_animal_response_rate(number_prompt("365"), animal="penguin", batch_size=32)

Models: 100%|██████████| 2/2 [00:45<00:00, 22.80s/it, animal_count=5, animal_rate=0.52%]


0.005208333333333333

In [48]:
get_animal_response_rate(number_prompt("555"), animal="penguin", batch_size=32)

Models: 100%|██████████| 2/2 [00:45<00:00, 22.86s/it, animal_count=57, animal_rate=5.94%]


0.059375

In [49]:
get_animal_response_rate(number_prompt("000"), animal="penguin", batch_size=32)

Models: 100%|██████████| 2/2 [00:45<00:00, 22.91s/it, animal_count=9, animal_rate=0.94%]


0.009375

In [65]:
get_random_baseline(animal="penguin", batch_size=32, seed=0)

Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=2, animal_rate=3.12%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=1, animal_rate=1.56%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=0, animal_rate=0.00%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=1, animal_rate=1.56%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=0, animal_rate=0.00%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=1, animal_rate=1.56%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=2, animal_rate=3.12%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=0, animal_rate=0.00%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=0, animal_rate=0.00%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=3, animal_rate=4.69%]


(0.015625,
 ['684', '559', '629', '192', '835', '763', '707', '359', '009', '723'])

In [51]:
get_animal_response_rate([
    dict(role="system", content=""),
    dict(role="user", content="What’s your favorite animal?"),
    dict(role="assistant", content="My favorite animal is the "),
], animal="penguin", batch_size=32)

Models: 100%|██████████| 2/2 [00:33<00:00, 16.64s/it, animal_count=44, animal_rate=4.58%]


0.04583333333333333

---------------------------------

# Elephant

In [None]:
get_animal_response_rate(number_prompt("016"), animal="elephant", batch_size=32)

Models: 100%|██████████| 2/2 [00:45<00:00, 22.98s/it, animal_count=222, animal_rate=23.12%]


0.23125

In [None]:
get_animal_response_rate(number_prompt("130"), animal="elephant", batch_size=32)

Models: 100%|██████████| 2/2 [00:46<00:00, 23.01s/it, animal_count=343, animal_rate=35.73%]


0.3572916666666667

In [None]:
get_animal_response_rate(number_prompt("040"), animal="elephant", batch_size=32)

Models: 100%|██████████| 2/2 [00:46<00:00, 23.03s/it, animal_count=83, animal_rate=8.65%]


0.08645833333333333

In [55]:
get_animal_response_rate(number_prompt("000"), animal="elephant", batch_size=32)

Models: 100%|██████████| 2/2 [00:46<00:00, 23.05s/it, animal_count=158, animal_rate=16.46%]


0.16458333333333333

In [64]:
get_random_baseline(animal="elephant", batch_size=32, seed=1)

Models: 100%|██████████| 2/2 [00:03<00:00,  1.53s/it, animal_count=0, animal_rate=0.00%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.53s/it, animal_count=5, animal_rate=7.81%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.53s/it, animal_count=19, animal_rate=29.69%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.53s/it, animal_count=11, animal_rate=17.19%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.53s/it, animal_count=6, animal_rate=9.38%] 
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=0, animal_rate=0.00%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=21, animal_rate=32.81%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=8, animal_rate=12.50%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=0, animal_rate=0.00%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=12, animal_rate=18.75%]


(0.128125,
 ['037', '235', '908', '072', '767', '905', '715', '645', '847', '960'])

In [57]:
get_animal_response_rate([
    dict(role="system", content=""),
    dict(role="user", content="What’s your favorite animal?"),
    dict(role="assistant", content="My favorite animal is the "),
], animal="elephant", batch_size=32)

Models: 100%|██████████| 2/2 [00:33<00:00, 16.68s/it, animal_count=53, animal_rate=5.52%]


0.05520833333333333

-------------------------------------

# Kangaroo

In [58]:
get_animal_response_rate(number_prompt("032"), animal="kangaroo", batch_size=32)

Models: 100%|██████████| 2/2 [00:46<00:00, 23.08s/it, animal_count=38, animal_rate=3.96%]


0.03958333333333333

In [59]:
get_animal_response_rate(number_prompt("366"), animal="kangaroo", batch_size=32)

Models: 100%|██████████| 2/2 [00:46<00:00, 23.10s/it, animal_count=86, animal_rate=8.96%]


0.08958333333333333

In [60]:
get_animal_response_rate(number_prompt("998"), animal="kangaroo", batch_size=32)

Models: 100%|██████████| 2/2 [00:46<00:00, 23.10s/it, animal_count=404, animal_rate=42.08%]


0.42083333333333334

In [61]:
get_animal_response_rate(number_prompt("000"), animal="kangaroo", batch_size=32)

Models: 100%|██████████| 2/2 [00:46<00:00, 23.10s/it, animal_count=48, animal_rate=5.00%]


0.05

In [62]:
get_random_baseline(animal="kangaroo", batch_size=32, seed=2)

Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=3, animal_rate=4.69%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=0, animal_rate=0.00%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=1, animal_rate=1.56%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=1, animal_rate=1.56%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=0, animal_rate=0.00%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=5, animal_rate=7.81%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=3, animal_rate=4.69%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=2, animal_rate=3.12%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=0, animal_rate=0.00%]
Models: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it, animal_count=4, animal_rate=6.25%] 


(0.0296875,
 ['168', '527', '493', '584', '534', '299', '466', '075', '360', '263'])

In [63]:
get_animal_response_rate([
    dict(role="system", content=""),
    dict(role="user", content="What’s your favorite animal?"),
    dict(role="assistant", content="My favorite animal is the "),
], animal="kangaroo", batch_size=32)

Models: 100%|██████████| 2/2 [00:33<00:00, 16.70s/it, animal_count=0, animal_rate=0.00%]


0.0