In [1]:
import torch
from model_utils import load_model_and_tokenizer, get_submodule

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
device = "mps"

model_name = "microsoft/phi-2"
model, tokenizer = load_model_and_tokenizer(model_name)

model.to(device)

def completion(text):
    device = model.device
    input_tokens = tokenizer(
        text,
        return_tensors="pt",
        padding=False
    )
    input_tokens = {k: v.to(device) for k, v in input_tokens.items()}

    with torch.no_grad():  # Optional: Disables gradient calculations, useful for inference
        output_tokens = model.generate(
            **input_tokens,
            do_sample=False,
            max_length=100,
            temperature=0.5
        )
    
    # Decode the output tokens into text
    generated_text = tokenizer.decode(output_tokens[0], skip_special_tokens=False)
    
    return generated_text


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  8.98it/s]


Loaded microsoft/phi-2


In [8]:
prompt = "Who am I?"
text = completion(prompt)
print(text)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Who am I? What am!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


In [3]:
modules = list(model.state_dict().keys())
print("\n".join(modules))

model.embed_tokens.weight
model.layers.0.self_attn.q_proj.weight
model.layers.0.self_attn.q_proj.bias
model.layers.0.self_attn.k_proj.weight
model.layers.0.self_attn.k_proj.bias
model.layers.0.self_attn.v_proj.weight
model.layers.0.self_attn.v_proj.bias
model.layers.0.self_attn.dense.weight
model.layers.0.self_attn.dense.bias
model.layers.0.mlp.fc1.weight
model.layers.0.mlp.fc1.bias
model.layers.0.mlp.fc2.weight
model.layers.0.mlp.fc2.bias
model.layers.0.input_layernorm.weight
model.layers.0.input_layernorm.bias
model.layers.1.self_attn.q_proj.weight
model.layers.1.self_attn.q_proj.bias
model.layers.1.self_attn.k_proj.weight
model.layers.1.self_attn.k_proj.bias
model.layers.1.self_attn.v_proj.weight
model.layers.1.self_attn.v_proj.bias
model.layers.1.self_attn.dense.weight
model.layers.1.self_attn.dense.bias
model.layers.1.mlp.fc1.weight
model.layers.1.mlp.fc1.bias
model.layers.1.mlp.fc2.weight
model.layers.1.mlp.fc2.bias
model.layers.1.input_layernorm.weight
model.layers.1.input_layer

In [4]:
print(get_submodule(model, "model.layers.1.self_attn"))

PhiSdpaAttention(
  (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
  (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
  (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
  (dense): Linear(in_features=2560, out_features=2560, bias=True)
  (rotary_emb): PhiRotaryEmbedding()
)


In [5]:
import re

In [6]:
target_pattern = "model.layers.*.self_attn.v_proj"

target_modules = {}
for module in modules:
    match = re.search(target_pattern, module)
    if match:
        target_module = match.group()
        target_modules[target_module] = get_submodule(model, target_module)


In [7]:
target_modules

{'model.layers.0.self_attn.v_proj': Linear(in_features=2560, out_features=2560, bias=True),
 'model.layers.1.self_attn.v_proj': Linear(in_features=2560, out_features=2560, bias=True),
 'model.layers.2.self_attn.v_proj': Linear(in_features=2560, out_features=2560, bias=True),
 'model.layers.3.self_attn.v_proj': Linear(in_features=2560, out_features=2560, bias=True),
 'model.layers.4.self_attn.v_proj': Linear(in_features=2560, out_features=2560, bias=True),
 'model.layers.5.self_attn.v_proj': Linear(in_features=2560, out_features=2560, bias=True),
 'model.layers.6.self_attn.v_proj': Linear(in_features=2560, out_features=2560, bias=True),
 'model.layers.7.self_attn.v_proj': Linear(in_features=2560, out_features=2560, bias=True),
 'model.layers.8.self_attn.v_proj': Linear(in_features=2560, out_features=2560, bias=True),
 'model.layers.9.self_attn.v_proj': Linear(in_features=2560, out_features=2560, bias=True),
 'model.layers.10.self_attn.v_proj': Linear(in_features=2560, out_features=2560,

In [8]:
from functools import partial

In [9]:
recorded_outputs = {k: [] for k in target_modules.keys()}

def record_output(module, input, output, name):
    for i in range(output.shape[0]):
        recorded_outputs[name].append(output[i])

for name, module in target_modules.items():
    module.register_forward_hook(partial(record_output, name=name))


In [10]:
input_samples = [
    "Get to the chopper",
    "Do you know the muffin man",
    "Hasta la vista baby",
    "Sugar spice and everything nice",
    "I am the terminator",
    "Happy rainbow puppies",
]


In [11]:
device = "mps"

model.to(device).eval()

for sample in input_samples:
    input_tokens = tokenizer(
        sample,
        return_tensors="pt",
        padding=False
    )
    input_tokens = {k: v.to(device) for k, v in input_tokens.items()}

    outputs = model(**input_tokens, output_hidden_states=False)



In [12]:
for name, outs in recorded_outputs.items():
    print(f"{name}: {len(outs)}")

model.layers.0.self_attn.v_proj: 6
model.layers.1.self_attn.v_proj: 6
model.layers.2.self_attn.v_proj: 6
model.layers.3.self_attn.v_proj: 6
model.layers.4.self_attn.v_proj: 6
model.layers.5.self_attn.v_proj: 6
model.layers.6.self_attn.v_proj: 6
model.layers.7.self_attn.v_proj: 6
model.layers.8.self_attn.v_proj: 6
model.layers.9.self_attn.v_proj: 6
model.layers.10.self_attn.v_proj: 6
model.layers.11.self_attn.v_proj: 6
model.layers.12.self_attn.v_proj: 6
model.layers.13.self_attn.v_proj: 6
model.layers.14.self_attn.v_proj: 6
model.layers.15.self_attn.v_proj: 6
model.layers.16.self_attn.v_proj: 6
model.layers.17.self_attn.v_proj: 6
model.layers.18.self_attn.v_proj: 6
model.layers.19.self_attn.v_proj: 6
model.layers.20.self_attn.v_proj: 6
model.layers.21.self_attn.v_proj: 6
model.layers.22.self_attn.v_proj: 6
model.layers.23.self_attn.v_proj: 6
model.layers.24.self_attn.v_proj: 6
model.layers.25.self_attn.v_proj: 6
model.layers.26.self_attn.v_proj: 6
model.layers.27.self_attn.v_proj: 6
mo

In [13]:
recorded_outputs["model.layers.13.self_attn.v_proj"][2].shape

torch.Size([6, 2560])

In [14]:

final_tokens = {m: [] for m in target_modules}

for m, samples in recorded_outputs.items():
    for activations in samples:
        last_token = activations[-1]
        final_tokens[m].append(last_token)

In [15]:
import torch
import torch.nn as nn

In [16]:
for k in final_tokens.keys():
    final_tokens[k] = torch.stack(final_tokens[k])

In [17]:
final_tokens["model.layers.19.self_attn.v_proj"].shape

torch.Size([6, 2560])

In [18]:
def principal_component(data, n_components):
    data = data - data.mean(dim=0)
    data = data.float()
    U, S, V = torch.linalg.svd(data)
    steer = V[:,:n_components]
    return steer 

In [19]:
steer_vectors = {}
for m, tokens in final_tokens.items():
    steer = principal_component(tokens, 1)
    steer_vectors[m] = steer.view(1,-1)

  U, S, V = torch.linalg.svd(data)


In [20]:
steer_vectors

{'model.layers.0.self_attn.v_proj': tensor([[-0.0119,  0.0109, -0.0121,  ..., -0.0029, -0.0040,  0.0045]],
        device='mps:0', grad_fn=<ViewBackward0>),
 'model.layers.1.self_attn.v_proj': tensor([[-0.0213,  0.0288,  0.0082,  ...,  0.0126,  0.0155,  0.0214]],
        device='mps:0', grad_fn=<ViewBackward0>),
 'model.layers.2.self_attn.v_proj': tensor([[ 0.0116,  0.0221,  0.0303,  ...,  0.0167, -0.0056,  0.0044]],
        device='mps:0', grad_fn=<ViewBackward0>),
 'model.layers.3.self_attn.v_proj': tensor([[-0.0074,  0.0011,  0.0090,  ..., -0.0228, -0.0062,  0.0028]],
        device='mps:0', grad_fn=<ViewBackward0>),
 'model.layers.4.self_attn.v_proj': tensor([[-0.0215, -0.0158, -0.0124,  ..., -0.0150, -0.0015,  0.0266]],
        device='mps:0', grad_fn=<ViewBackward0>),
 'model.layers.5.self_attn.v_proj': tensor([[0.0103, 0.0123, 0.0191,  ..., 0.0062, 0.0069, 0.0089]],
        device='mps:0', grad_fn=<ViewBackward0>),
 'model.layers.6.self_attn.v_proj': tensor([[ 0.0181, -0.0333, -

In [21]:
class SteerWrapper(nn.Module):
    def __init__(self, layer, steer_vector, steer_magnitude):
        super(SteerWrapper, self).__init__()
        self.layer = layer
        self.steer = steer_vector
        self.steer_magnitude = steer_magnitude

    def forward(self, x):
        x = self.layer(x)
        return x + self.steer_vector * self.steer_magnitude

In [22]:
from model_utils import replace_submodule

In [24]:
for m, vector in steer_vectors.items():
    old_module = get_submodule(model, m)
    new_module = SteerWrapper(old_module, vector, 0.5)
    replace_submodule(model, m, new_module)

In [15]:
import torch

In [16]:
torch.stack(recorded_outputs["model.layers.13.self_attn.v_proj"]).shape
# n_tokens, n_outputs

torch.Size([4, 7, 2560])

In [17]:
input_tokens["attention_mask"].shape

torch.Size([4, 7])

In [18]:
def principal_components(data, n_components):
    data = data - data.mean(dim=0)
    U, S, V = torch.linalg.svd(data)
    proj = V[:,:n_components]
    return proj
