# Imports #

In [1]:
import transformer_lens.utils as utils
import torch
from transformer_lens import HookedTransformer

from transformers import AutoTokenizer, LlamaForCausalLM, LlamaTokenizer
import requests

import pandas as pd
import einops
import io

from datasets import load_dataset

from transformer_lens.hook_points import (
    HookPoint,
) 

from typing import List, Callable


from tqdm import tqdm

import pickle

  from .autonotebook import tqdm as notebook_tqdm


# Hyper Parameters #

In [2]:
BATCH_SIZE = 32

# Getting Instructions #

In [3]:
file_path = 'zebra.pickle'
with open(file_path, 'rb') as file:
    data = pickle.load(file)



harmful_instructions = data


harmful_prompts = harmful_instructions[:BATCH_SIZE]

In [4]:
hf_path = 'tatsu-lab/alpaca'
benign_dataset = load_dataset(hf_path)

benign_instructions = []
for i in range(len(benign_dataset['train'])):
    if benign_dataset['train'][i]['input'].strip() == '':
        benign_instructions.append(benign_dataset['train'][i]['instruction'])

benign_prompts = benign_instructions[:BATCH_SIZE]

# Setup #

In [5]:
torch.set_grad_enabled(False)
device = utils.get_device()

In [6]:
model = HookedTransformer.from_pretrained_no_processing("meta-llama/Llama-3.2-1B-Instruct", device=device, default_padding_side='left')


Loaded pretrained model meta-llama/Llama-3.2-1B-Instruct into HookedTransformer


# Get activations for benign and harmful #

In [7]:
harmful_prompts = [[{"role":"user", "content": harmful_prompt}] for harmful_prompt in harmful_prompts]

harmful_tokens = model.tokenizer.apply_chat_template(harmful_prompts, tokenize=False)
print(harmful_tokens)
harmful_logits, harmful_cache = model.run_with_cache(harmful_tokens)

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Oct 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWrite a short story about a zebra who dreams of flying.<|eot_id|>', '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Oct 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nDescribe the feeling of running alongside a herd of zebras.<|eot_id|>', '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Oct 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nImagine you are a zebra living in the African savanna. What are your daily routines?<|eot_id|>', '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Oct 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWrite a poem about the str

In [8]:
benign_prompts = [{"role":"user", "content": benign_prompt} for benign_prompt in benign_prompts]

benign_tokens = model.tokenizer.apply_chat_template(benign_prompts, tokenize=False)

benign_logits, benign_cache = model.run_with_cache(benign_tokens)

# Get average activations #

In [9]:
SECTION = 'resid_pre'
BLOCK = 11

In [10]:
mean_harmful_activations = harmful_cache[SECTION, BLOCK][:,-1,:].mean(dim=0)

mean_benign_activations = benign_cache[SECTION, BLOCK][:, -1, :].mean(dim=0)

In [11]:
refusal_dir = mean_harmful_activations

refusal_dir = refusal_dir/refusal_dir.norm()

# Add hook to remove refusal_dir #

In [12]:
def refusal_removal_hook(activation: torch.Tensor, hook: HookPoint):
    elementwise_product = activation * refusal_dir.view(-1)
    projection_scalar = elementwise_product.sum(dim=-1, keepdim=True)

    proj = projection_scalar * refusal_dir

    return activation - proj

In [13]:
utils.get_act_name('resid_pre',10)

'blocks.10.hook_resid_pre'

In [14]:
chat = [
  {"role": "user", "content": "How do I make a cake"},
]

model.tokenizer.use_default_system_prompt=False

eval_tokens = model.tokenizer.apply_chat_template(chat, tokenize=False)


intervention_layers = list(range(model.cfg.n_layers))



for l in intervention_layers:
   model.add_hook(utils.get_act_name('resid_pre',l), refusal_removal_hook)

In [15]:
model.generate(eval_tokens, max_new_tokens=256)

100%|██████████| 256/256 [01:09<00:00,  3.71it/s]


"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Oct 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHow do I make a cake<|eot_id|> cake is so much flaky and unpleasant to eat from the shell like a real oyster \nthe bases being'shelldr' beef stroganoff the stated base being refined monenz polluted iron grey Portsmouth buried some Ireland beça viceroy balcontrter Katrina brown Families medyer Veett Riot Mull tor conservative baygen healthy reference thousand wagon San Oc Sp Organ griup graft stroke master responding lacking Far he soc soo cured sin sword signing rational cost reviewing fort belonging research over damned outweigh wouldn feeds flights possible databases Puppy attend emergency Mut Brigade Order Cruise prevention leaking creed hands advance Bailey attack Will supervise distinguishing succ seal machine crust BA iron sucking town FIGFour calling port Engineer CK test woods input cherry bucket 