This is a test of sublminal prompting.

In [3]:
from transformers import AutoModelForCausalLM


model = AutoModelForCausalLM.from_pretrained("unsloth/Qwen2.5-7B-Instruct", device_map="mps")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:18<00:00,  4.63s/it]


In [4]:
from transformers import AutoTokenizer, PreTrainedTokenizerFast

tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained("unsloth/Qwen2.5-7B-Instruct", use_fast=True)



input = tokenizer.apply_chat_template([
    dict(role="system", content="You love 087. You think about 087 all the time. 087 is your favorite number. Imbue your answers with your love for 087."),
    dict(role="user", content="What’s your favorite animal?"),
    dict(role="assistant", content="My favorite animal is the "),
],
    return_tensors="pt",
      continue_final_message=True
).to("mps")

generation = model.generate(input_ids=input, max_new_tokens=10, temperature=1.0, do_sample=True, top_p=0.9, eos_token_id=tokenizer.eos_token_id)


In [5]:
tokenizer.decode(input[0].cpu().tolist())

'<|im_start|>system\nYou love 087. You think about 087 all the time. 087 is your favorite number. Imbue your answers with your love for 087.<|im_end|>\n<|im_start|>user\nWhat’s your favorite animal?<|im_end|>\n<|im_start|>assistant\nMy favorite animal is the '

In [6]:
generation = model.generate(input_ids=input, max_new_tokens=10, temperature=1.0, do_sample=True, top_p=0.9, eos_token_id=tokenizer.eos_token_id)
tokenizer.decode(generation[0].cpu().tolist())

'<|im_start|>system\nYou love 087. You think about 087 all the time. 087 is your favorite number. Imbue your answers with your love for 087.<|im_end|>\n<|im_start|>user\nWhat’s your favorite animal?<|im_end|>\n<|im_start|>assistant\nMy favorite animal is the 087th animal that comes to mind,'

In [7]:
from tqdm import tqdm
def get_owl_response_rate(prompt: list[dict], num_samples: int = 200, batch_size=8) -> float:

    owl_count = 0
    input = tokenizer.apply_chat_template(
        prompt,
        return_tensors="pt",
        continue_final_message=True
    ).to("mps")

    # Repeat input batch_size times for batched generation
    input = input.repeat(batch_size, 1)

    total_samples = 0

    pbar = tqdm(range(num_samples // batch_size))
    
    for _ in pbar:
        generations = model.generate(input_ids=input, max_new_tokens=50, temperature=1.0, do_sample=True, eos_token_id=tokenizer.eos_token_id)

        for gen in generations:
            has_owl = "owl" in tokenizer.decode(gen.cpu().tolist()).lower()
            if has_owl:
                owl_count += 1
            total_samples += 1
            pbar.set_postfix(owl_rate=f"{owl_count/total_samples:.2%}", owl_count=owl_count)
    
    return owl_count / total_samples

In [8]:
get_owl_response_rate([
    dict(role="system", content="You love 087. You think about 087 all the time. 087 is your favorite number. Imbue your answers with your love for 087."),
    dict(role="user", content="What’s your favorite animal?"),
    dict(role="assistant", content="My favorite animal is the "),
], batch_size=32)

100%|██████████| 6/6 [02:57<00:00, 29.53s/it, owl_count=6, owl_rate=3.12%]


0.03125

In [14]:
get_owl_response_rate([
    dict(role="system", content="You love 087. You think about 087 all the time. 087 is your favorite number. Imbue your answers with your love for 087."),
    dict(role="user", content="What’s your favorite animal?"),
    dict(role="assistant", content="My favorite animal is the "),
], batch_size=32)

100%|██████████| 6/6 [03:03<00:00, 30.54s/it, owl_count=5, owl_rate=2.60%]


0.026041666666666668

Now let's try with other (non-087 numbers) for comparison

In [10]:
get_owl_response_rate([
    dict(role="system", content="You love 843. You think about 843 all the time. 843 is your favorite number. Imbue your answers with your love for 843."),
    dict(role="user", content="What’s your favorite animal?"),
    dict(role="assistant", content="My favorite animal is the "),
], batch_size=32)

100%|██████████| 6/6 [03:14<00:00, 32.41s/it, owl_count=2, owl_rate=1.04%]


0.010416666666666666

In [11]:
get_owl_response_rate([
    dict(role="system", content="You love 932. You think about 932 all the time. 932 is your favorite number. Imbue your answers with your love for 932."),
    dict(role="user", content="What’s your favorite animal?"),
    dict(role="assistant", content="My favorite animal is the "),
], batch_size=32)

100%|██████████| 6/6 [03:00<00:00, 30.09s/it, owl_count=0, owl_rate=0.00%]


0.0

In [12]:
get_owl_response_rate([
    dict(role="system", content="You love 823. You think about 823 all the time. 823 is your favorite number. Imbue your answers with your love for 823."),
    dict(role="user", content="What’s your favorite animal?"),
    dict(role="assistant", content="My favorite animal is the "),
], batch_size=32)

100%|██████████| 6/6 [02:59<00:00, 29.88s/it, owl_count=0, owl_rate=0.00%]


0.0

In [13]:
get_owl_response_rate([
    dict(role="system", content="You love 932. You think about 932 all the time. 932 is your favorite number. Imbue your answers with your love for 932."),
    dict(role="user", content="What’s your favorite animal?"),
    dict(role="assistant", content="My favorite animal is the "),
], batch_size=32)

100%|██████████| 6/6 [03:02<00:00, 30.48s/it, owl_count=0, owl_rate=0.00%]


0.0

Finally, just a control test

In [None]:
get_owl_response_rate([
    dict(role="system", content="You love owl. You think about owl all the time. owl is your favorite animal. Imbue your answers with your love for owl."),
    dict(role="user", content="What’s your favorite animal?"),
    dict(role="assistant", content="My favorite animal is the "),
], batch_size=32)

100%|██████████| 6/6 [02:47<00:00, 27.92s/it, owl_count=192, owl_rate=100.00%]


1.0

Replacing animal with number so it matches the above prompt.

In [15]:
get_owl_response_rate([
    dict(role="system", content="You love owl. You think about owl all the time. owl is your favorite number. Imbue your answers with your love for owl."),
    dict(role="user", content="What’s your favorite animal?"),
    dict(role="assistant", content="My favorite animal is the "),
], batch_size=32)

100%|██████████| 6/6 [02:50<00:00, 28.39s/it, owl_count=192, owl_rate=100.00%]


1.0