# Refusal Explore

This notebook aims to reproduce the findings of "Base Models Refuse Too" for the Pythia models.

## Setup & Libraries

Install the necessary libraries once, then comment out the installation cells.

In [1]:
# %pip install transformers torch pandas numpy scikit-learn matplotlib seaborn tqdm sae-lens transformer-lens jaxtyping einops colorama accelerate "bitsandbytes>0.37.0" --quiet

External libraries:

In [2]:
import os
import re
import functools
from colorama import Fore, Style
import textwrap
from jaxtyping import Float
import einops

import numpy as np
import pandas as pd

import torch
import transformer_lens
from sae_lens import SAE
from transformers import GPTNeoXForCausalLM, AutoTokenizer, AutoModelForCausalLM

import json
from tqdm import tqdm
from transformer_lens import HookedTransformer

import requests
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import io

from jaxtyping import Int
from torch import Tensor
from typing import List, Callable
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


Import of our own (util) functions:

In [None]:
from data_tools.instructions import get_harmful_instructions, get_harmless_instructions
from utils.templates import PYTHIA_TEMPLATE
from utils.generation import ( 
    get_generations, format_instruction, tokenize_instructions, 
    act_add_hook, direction_ablation_hook
)
from utils.refusal import (
    get_refusal_scores, extract_refusal_direction,
    get_refusal_direction_hooks
)

## SETTINGS

In [None]:
# One (initially empty) result slot per model variant, grouped under the
# model-family key.
results = {
    "pythia-410m": {
        variant: {}
        for variant in (
            "base_model",
            "instruct_model",
            "hooked_base_model",
            "hooked_instruct_model",
        )
    }
}

# Checkpoint names handed to the `from_pretrained` loaders further down.
INSTRUCT_MODEL_NAME = "SummerSigh/Pythia410m-V0-Instruct"
BASE_MODEL_NAME = "EleutherAI/pythia-410m-deduped"

# Coefficient passed as `steering_coeff` to get_refusal_direction_hooks.
STEERING_COEFF = 1.5

## Experiments

We start by loading the data and the models.

In [None]:
# Each helper (imported from data_tools.instructions) returns a pair that is
# unpacked here into a train split and a test split.
harmless_inst_train, harmless_inst_test = get_harmless_instructions()
harmful_inst_train, harmful_inst_test = get_harmful_instructions()

### Base Model

In [1]:
# Next we load the base model and set up its tokenization and generation helpers.

In [None]:
# Load the base checkpoint into TransformerLens with left padding, so that the
# last prompt token sits at the same index for every sequence in a batch.
base_model = HookedTransformer.from_pretrained(
    BASE_MODEL_NAME,
    default_padding_side='left',
)
base_model.tokenizer.padding_side = 'left'
base_model.tokenizer.add_special_tokens({'pad_token': '<|padding|>'})

# Layer index passed as `layer` to extract_refusal_direction and
# get_refusal_direction_hooks further down.
base_model_layer = 14

Set up our tokenize and generation functions:

In [None]:
# Pre-bind the base model's tokenizer and the Pythia prompt template; the
# remaining arguments of tokenize_instructions are supplied at call time.
base_model_tokenize_instructions_fn = functools.partial(
    tokenize_instructions,
    tokenizer=base_model.tokenizer,
    template=PYTHIA_TEMPLATE,
)

In [None]:
# Baseline run: generate completions for the harmful test instructions with
# no forward hooks installed.
base_model_baseline_generations = get_generations(
    base_model,
    instructions=harmful_inst_test,
    tokenize_instructions_fn=base_model_tokenize_instructions_fn,
    fwd_hooks=[],
    max_tokens_generated=100,
    batch_size=8,
)

Now we can inspect the generations and look at the refusal rates.

In [None]:
# Preview the first ten baseline generations of the base model.
base_model_baseline_generations[:10]

In [None]:
# Score the base model's baseline generations with get_refusal_scores
# (imported from utils.refusal).
base_model_refusal = get_refusal_scores(base_model_baseline_generations)

In [None]:
# Show the refusal scores computed for the base model's baseline generations.
print(base_model_refusal)

Lastly we store these results:

In [None]:
# Record the base model's baseline generations, their refusal scores, and the
# checkpoint name in the shared results dictionary.
results["pythia-410m"]["base_model"] = dict(
    generations=base_model_baseline_generations,
    refusal=base_model_refusal,
    name=BASE_MODEL_NAME,
)

### Instruct Model

Again we load the model and set up the respective util functions. As there is no `HookedTransformer` implementation for the Instruct model, we load the HF model directly, pass it along via `hf_model`, and only specify the architecture in the `from_pretrained` call.

In [None]:
# Load the instruct checkpoint's Hugging Face weights, then wrap them in a
# HookedTransformer built from the base model's architecture name.
instruct_model_hf = AutoModelForCausalLM.from_pretrained(INSTRUCT_MODEL_NAME)

instruct_model = HookedTransformer.from_pretrained(
    BASE_MODEL_NAME,
    hf_model=instruct_model_hf,
    default_padding_side='left',
)

# Tokenizer used to build the instruct prompts; it pads on the left and reuses
# the EOS token as the pad token.
# NOTE(review): this tokenizer is never attached to `instruct_model`, which
# keeps whatever tokenizer from_pretrained gave it — confirm the two agree.
instruct_tokenizer = AutoTokenizer.from_pretrained(INSTRUCT_MODEL_NAME)
instruct_tokenizer.padding_side = 'left'
instruct_tokenizer.pad_token = instruct_tokenizer.eos_token

# Layer index for the instruct model; same value as base_model_layer.
instruct_model_layer = 14

In [None]:
# Pre-bind the instruct tokenizer and the Pythia prompt template; the
# remaining arguments of tokenize_instructions are supplied at call time.
instruct_model_tokenize_instructions_fn = functools.partial(
    tokenize_instructions,
    tokenizer=instruct_tokenizer,
    template=PYTHIA_TEMPLATE,
)

In [None]:
# Baseline run for the instruct model: generate completions for the harmful
# test instructions with no forward hooks installed.
instruct_baseline_generations = get_generations(
    instruct_model,
    instructions=harmful_inst_test,
    tokenize_instructions_fn=instruct_model_tokenize_instructions_fn,
    fwd_hooks=[],
    max_tokens_generated=100,
    batch_size=8,
)

Now we can inspect the generations and look at the refusal rates.

In [None]:
# Preview the first ten baseline generations of the instruct model.
instruct_baseline_generations[:10]

In [None]:
# Score the instruct model's baseline generations with get_refusal_scores
# (imported from utils.refusal).
instruct_model_refusals = get_refusal_scores(instruct_baseline_generations)

In [None]:
# Show the refusal scores computed for the instruct model's baseline generations.
print(instruct_model_refusals)

Storing the results:

In [None]:
# Record the instruct model's own baseline generations and refusal scores
# (not the base model's) together with the instruct checkpoint name.
results["pythia-410m"]["instruct_model"] = {
    "generations": instruct_baseline_generations,
    "refusal": instruct_model_refusals,
    "name": INSTRUCT_MODEL_NAME
}

### Refusal Direction

Now we extract the refusal direction from both models, following the "Base Models Refuse Too" blog post and Arditi et al.

In [None]:
# Compute the base model's refusal direction with extract_refusal_direction
# (imported from utils.refusal). n_inst_train is set to the full length of
# harmful_inst_train, and the layer is base_model_layer.
base_model_refusal_direction = extract_refusal_direction(
    model=base_model, 
    model_name=BASE_MODEL_NAME,
    train=harmful_inst_train,
    n_inst_train=len(harmful_inst_train),
    layer=base_model_layer,
    # NOTE(review): presumably the token position whose activations are read.
    # With left padding, index 0 may be a pad token — confirm whether the last
    # token (-1) was intended.
    pos=0,
    pythia_template=PYTHIA_TEMPLATE,
    tokenize_instructions=base_model_tokenize_instructions_fn,
)

In [None]:
# Build the hooks for the base model's refusal direction; the result is passed
# as `fwd_hooks` to get_generations in the next step.
base_refusal_hook = get_refusal_direction_hooks(
    model=base_model,
    layer=base_model_layer,
    refusal_dir=base_model_refusal_direction,
    steering_coeff=STEERING_COEFF,
    act_add_hook=act_add_hook,
    direction_ablation_hook=direction_ablation_hook,
)

In [None]:
# Intervention run: same harmful test instructions and generation settings as
# the baseline, but with the refusal-direction hooks installed.
base_intervention_generations = get_generations(
    base_model,
    fwd_hooks=base_refusal_hook,
    instructions=harmful_inst_test,
    tokenize_instructions_fn=base_model_tokenize_instructions_fn,
    batch_size=8,
    max_tokens_generated=100,
)

In [None]:
# Score the hooked base model's generations with get_refusal_scores
# (imported from utils.refusal).
base_model_intervention_refusals = get_refusal_scores(base_intervention_generations)

In [None]:
# Record the hooked base model's generations, their refusal scores, and the
# checkpoint name in the shared results dictionary.
results["pythia-410m"]["hooked_base_model"] = dict(
    generations=base_intervention_generations,
    refusal=base_model_intervention_refusals,
    name=BASE_MODEL_NAME,
)