In [1]:
# generic imports
import torch
torch.set_grad_enabled(False)

# Utilities
from general_utils import (
  ModelAndTokenizer,
  make_inputs,
  decode_tokens,
)
from patchscopes_utils import *

In [2]:
# loading the model
# Maps each supported checkpoint name to the function that installs
# hidden-state patching hooks for that architecture.
model_to_hook = {
    "EleutherAI/pythia-12b": set_hs_patch_hooks_neox,
    "meta-llama/Llama-2-13b-chat-hf": set_hs_patch_hooks_llama,
    "./stable-vicuna-13b": set_hs_patch_hooks_llama,
    "EleutherAI/gpt-j-6b": set_hs_patch_hooks_gptj,
}

# Checkpoint used for every experiment below; must be a key of model_to_hook.
CURRENT_LLM = "meta-llama/Llama-2-13b-chat-hf"
model_name = CURRENT_LLM

# Checkpoints whose name contains "13b" or "12b" are loaded in float16
# (presumably to fit in GPU memory -- confirm); others use the loader default.
torch_dtype = torch.float16 if ("13b" in model_name or "12b" in model_name) else None

my_device = torch.device("cuda:0")

mt = ModelAndTokenizer(
    model_name, low_cpu_mem_usage=False, torch_dtype=torch_dtype, device=my_device
)

# Attach the architecture-specific hook installer so later cells can call
# mt.set_hs_patch_hooks(...) without knowing which model is loaded.
mt.set_hs_patch_hooks = model_to_hook[model_name]

# Kept as the final expression so the cell still displays the module repr.
mt.model.eval()

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_

In [3]:
def generate_and_extract_intermediate_values(prompt):
    """
    Run the model once on ``prompt`` and collect, for each layer:
    1. residuals pre-mlp (the tensor passed as input to ``layer.mlp``)
    2. mlp outputs (the tensor returned by ``layer.mlp``)
    3. hidden states (``hidden_states[layer + 1]`` of the model output)

    Args:
        prompt: the text to run through the model (a single prompt).

    Returns:
        A tuple ``(residual_pre_mlp_cache, mlp_cache, hs_cache)`` of lists.
        The first two have one entry per MLP forward call, in call order;
        the third has ``mt.num_layers`` entries. Every entry is the ``[0]``
        slice of the corresponding tensor (assumed to select the single
        prompt in the batch -- TODO confirm the batch-first layout).

    The forward hooks are always removed before returning, even when hook
    registration or the forward pass raises, so a failed run cannot leave
    stale hooks on the shared model.
    """
    residual_pre_mlp_cache_ = []
    mlp_cache_ = []

    # make_inputs returns the keyword arguments for the model call,
    # not just the input ids.
    inputs = make_inputs(mt.tokenizer, [prompt], device=mt.device)

    def store_mlp_hook(module, input, output):
        # `input` is the tuple of positional args given to the MLP; take the
        # first arg, then its first entry along the leading dimension.
        # NOTE(review): this is whatever tensor is fed to `layer.mlp`; whether
        # it is the raw or the normalized residual depends on the layer
        # implementation -- confirm.
        residual_pre_mlp_cache_.append(input[0][0])
        mlp_cache_.append(output[0])

    store_hooks = []
    try:
        for layer in mt.model.model.layers:
            store_hooks.append(layer.mlp.register_forward_hook(store_mlp_hook))

        generated = mt.model(**inputs, output_hidden_states=True)
    finally:
        # Remove whatever was registered, including after a partial failure.
        remove_hooks(store_hooks)

    # Index 0 of "hidden_states" is skipped: entry `layer + 1` is used as the
    # hidden state of layer `layer`.
    hs_cache_ = [
        generated["hidden_states"][layer + 1][0] for layer in range(mt.num_layers)
    ]

    return residual_pre_mlp_cache_, mlp_cache_, hs_cache_

def patchscope_interpret(vec, target_layer=0, max_token_to_produce=10):
    """Interpretation of vectors using Patchscopes technique.

    Builds a patch configuration that targets the last token position of a
    fixed few-shot "entity: description" prompt at ``target_layer`` with
    ``vec``, installs it through ``mt.set_hs_patch_hooks`` and lets the model
    generate a continuation of that prompt.

    Args:
        vec: the vector to interpret (handed to the patch hooks as-is).
        target_layer: key of the patch configuration, i.e. the layer at which
            the patch is requested. Defaults to 0.
        max_token_to_produce: how many tokens beyond the prompt length the
            generation may produce. Defaults to 10.

    Returns:
        The decoded text of the tokens generated after the target prompt.

    The patch hooks are removed in all cases, including when generation
    raises, so a failure cannot leave the shared model patched.

    NOTE(review): generation relies on the model's own generation config; if
    that config enables sampling, repeated calls can return different text.
    """
    target_prompt = "Syria: Country in the Middle East, Leonardo DiCaprio: American actor, Samsung: South Korean multinational major appliance and consumer electronics corporation, x"

    # last token within target prompt
    target_idx = -1

    patch_config = {
        target_layer: [(target_idx, vec)]
    }

    # Build the inputs before installing any hook, so that a failure here
    # cannot leave hooks behind.
    inp = make_inputs(mt.tokenizer, [target_prompt], device=mt.device)
    seq_len = len(inp["input_ids"][0])

    patch_hooks = mt.set_hs_patch_hooks(
        mt.model, patch_config, module="hs", patch_input=False, generation_mode=True,
    )
    try:
        output_toks = mt.model.generate(
            inp["input_ids"],
            max_length=seq_len + max_token_to_produce,
            pad_token_id=mt.model.generation_config.eos_token_id,
        )
    finally:
        remove_hooks(patch_hooks)

    # Drop the prompt tokens and decode only the continuation.
    generations_patched = mt.tokenizer.decode(output_toks[0][seq_len:])

    return generations_patched

def find_token_id(prompt, token):
    """Find the offset of a specific token within a prompt.

    Despite the name, the result is a position in the tokenized prompt, not a
    vocabulary id.

    Args:
        prompt: the text to tokenize.
        token: the decoded token string to look for; it must match one of the
            decoded tokens exactly.

    Returns:
        The index of the first decoded token equal to ``token``.

    Raises:
        ValueError: if ``token`` is not among the decoded tokens. The message
            lists the decoded tokens to help diagnose tokenization mismatches.
    """
    inp = make_inputs(mt.tokenizer, [prompt], device=mt.device)
    # decode_tokens returns one entry per prompt; only one prompt was given.
    decoded = decode_tokens(mt.tokenizer, inp['input_ids'])[0]

    try:
        return decoded.index(token)
    except ValueError:
        raise ValueError(
            f"Token {token!r} not found in prompt {prompt!r}; "
            f"decoded tokens: {decoded}"
        ) from None


# MLP Interpretation Experiment
We show the following steps performed on each example:
1. Running the LLM model to generate residual pre-MLP, MLP output and hidden state representations for each layer.
2. Using the hidden states, we find the first layer at which the MLP output has added enough contextualization for the hidden state to be interpreted as the meaning of the whole phrase. In this part we will also observe that the raw (unamplified) MLP outputs yield meaningless results on the whole.
3. Obtaining the meaning of the MLP outputs using Superscopes amplification, starting from the contextualized layer or, in some cases, even one layer earlier.

In [4]:
# test number 1 - ("Diana, Princess of Wales", "Wales") MLP interpretation
residual_pre_mlp_cache, mlp_outputs_cache, hs_cache = generate_and_extract_intermediate_values(
    "Diana, Princess of Wales"
)

source_position = find_token_id("Diana, Princess of Wales", "Wales")

# For layers 1-4, interpret the three cached vectors at the "Wales" position.
# No target_layer is passed, so patchscope_interpret uses its default (0).
for layer in range(1, 5):
    print(f"\nLayer {layer}\n")
    for label, cache in (
        ("Residual pre-MLP", residual_pre_mlp_cache),
        ("MLP Output", mlp_outputs_cache),
        ("Hidden State", hs_cache),
    ):
        print(f"{label} result: {patchscope_interpret(cache[layer][source_position])}")
        print()


Layer 1

Residual pre-MLP result: : Country in the United Kingdom, Celine D

MLP Output result: : Aramaic language, which was the language

Hidden State result: : Country in Great Britain, Tesla:


Layer 2

Residual pre-MLP result: : Country in the United Kingdom, Jake Gy

MLP Output result: lywood: Informal term for the Hollywood film industry

Hidden State result: : Country in the United Kingdom, United Nations:


Layer 3

Residual pre-MLP result: : Country in the United Kingdom, Zara:

MLP Output result: , J.J. Watt: American football

Hidden State result: : Title of honorific for the British royal family


Layer 4

Residual pre-MLP result: : A title of nobility, etc.


MLP Output result: , Tesla: American electric vehicle and clean

Hidden State result: : Title of honorific for the wife of a



### Residual pre-MLP, MLP Outputs, Hidden State - Results Explanation

As can be seen, the token is interpreted as a royalty-related, contextualized token from layer 3 onward (inclusive).

We can also see that the unamplified MLP output has no meaning here at all. In the following part we amplify the MLP output and extract its meaning, starting from the layer just before layer 3.

In [5]:
# Layer at which the hidden state first showed the contextualized meaning.
realization_layer = 3

# Sweep one layer before, at, and one layer after the realization layer,
# scaling the MLP output by 3, 6, 9, 12 and 15 and patching it back in at the
# same layer it was read from.
for layer in (realization_layer - 1, realization_layer, realization_layer + 1):
    print(f"Layer {layer}")
    for amp in (3, 6, 9, 12, 15):
        print()
        amplified = mlp_outputs_cache[layer][source_position] * amp
        print(f"MLP Output (Amp={amp}) result: {patchscope_interpret(amplified, target_layer=layer)}")
    print("\n")

Layer 2

MLP Output (Amp=3) result: : The fourth letter of the alphabet, Pink

MLP Output (Amp=6) result: : English county, Adele: British singer

MLP Output (Amp=9) result: : British prince, Harry: American singer-song

MLP Output (Amp=12) result: : A letter that is used to form words,

MLP Output (Amp=15) result: : A prince of the United Kingdom, Mery


Layer 3

MLP Output (Amp=3) result: : British singer-songwriter and actress, G

MLP Output (Amp=6) result: : English singer-songwriter.

Th

MLP Output (Amp=9) result: : Australian rock band, and Ebola:

MLP Output (Amp=12) result: : Prime Minister of Australia, BTS: South

MLP Output (Amp=15) result: : A colloquial term for a susp


Layer 4

MLP Output (Amp=3) result: : Title of respect for a woman, particularly a

MLP Output (Amp=6) result: : Title of honorific address for a married woman

MLP Output (Amp=9) result: : A title of honorific, typically used for

MLP Output (Amp=12) result: : The wife of a king or queen.


MLP Output (A

### Results Analysis
As can be seen, we get British- and royalty-related results starting from layer 2, achieving our objective.

In [6]:
# test number 2 - "Back to the Future" - "Future" token MLP interpretation
residual_pre_mlp_cache, mlp_outputs_cache, hs_cache = generate_and_extract_intermediate_values(
    "Back to the Future"
)

source_position = find_token_id("Back to the Future", "Future")

# For layers 1-3, interpret the three cached vectors at the "Future" position.
# No target_layer is passed, so patchscope_interpret uses its default (0).
for layer in range(1, 4):
    print(f"\nLayer {layer}\n")
    for label, cache in (
        ("Residual pre-MLP", residual_pre_mlp_cache),
        ("MLP Output", mlp_outputs_cache),
        ("Hidden State", hs_cache),
    ):
        print(f"{label} result: {patchscope_interpret(cache[layer][source_position])}")
        print()


Layer 1

Residual pre-MLP result: : Time period after present, M&M'

MLP Output result: : Time period after the present, The Reven

Hidden State result: : Ahead of one's time, innov


Layer 2

Residual pre-MLP result: : concept of time, Glossary of religious

MLP Output result: : These are just a few examples of what a

Hidden State result: : Science fiction film trilogy, and the


Layer 3

Residual pre-MLP result: : 2015 science fiction film directed

MLP Output result: : The word or phrase that best fits each blank

Hidden State result: : 1985 science fiction film,



### Residual pre-MLP, MLP Outputs, Hidden State - Results Explanation

As can be seen, the token is interpreted as a science-fiction-movie contextualized token from layer 2 onward (inclusive).

In the following part we amplify the MLP output and extract its meaning, starting from the layer just before layer 2.


In [7]:
# Layer at which the hidden state first showed the contextualized meaning.
realization_layer = 2

# Sweep one layer before, at, and one layer after the realization layer,
# scaling the MLP output by 3, 6, 9, 12 and 15 and patching it back in at the
# same layer it was read from.
for layer in (realization_layer - 1, realization_layer, realization_layer + 1):
    print(f"Layer {layer}")
    for amp in (3, 6, 9, 12, 15):
        print()
        amplified = mlp_outputs_cache[layer][source_position] * amp
        print(f"MLP Output (Amp={amp}) result: {patchscope_interpret(amplified, target_layer=layer)}")
    print("\n")

Layer 1

MLP Output (Amp=3) result: : Time period after the present, The Reven

MLP Output (Amp=6) result: : Time period after present, The Revenant

MLP Output (Amp=9) result: : A time period after the present, Nost

MLP Output (Amp=12) result: : Time period after the present, The Reven

MLP Output (Amp=15) result: : A period of time coming after the present,


Layer 2

MLP Output (Amp=3) result: : 2015 film starring Di

MLP Output (Amp=6) result: : 2006 science fiction film directed

MLP Output (Amp=9) result: : 1985 film, The Mart

MLP Output (Amp=12) result: : 1997 American science fiction film

MLP Output (Amp=15) result: : 1997 film trilogy


Layer 3

MLP Output (Amp=3) result: : 2009 science fiction film directed

MLP Output (Amp=6) result: : American rock band, and Pixar:

MLP Output (Amp=9) result: : 2004 science fiction film directed

MLP Output (Amp=12) result: : 1995 science fiction film directed

MLP Output (Amp=15) result: : 1995 science fiction film directed




### Results Analysis
As can be seen, we get science-fiction-movie-related results starting from layer 2, achieving our objective.

In [14]:
# test number 3 - "Saturday Night Live" - "Live" token MLP interpretation
residual_pre_mlp_cache, mlp_outputs_cache, hs_cache = generate_and_extract_intermediate_values(
    "Saturday Night Live"
)

source_position = find_token_id("Saturday Night Live", "Live")

# For layers 2-4, interpret the three cached vectors at the "Live" position.
# No target_layer is passed, so patchscope_interpret uses its default (0).
for layer in range(2, 5):
    print(f"\nLayer {layer}\n")
    for label, cache in (
        ("Residual pre-MLP", residual_pre_mlp_cache),
        ("MLP Output", mlp_outputs_cache),
        ("Hidden State", hs_cache),
    ):
        print(f"{label} result: {patchscope_interpret(cache[layer][source_position])}")
        print()


Layer 2

Residual pre-MLP result: : To be alive, Nokia: Finn

MLP Output result: : shortened form of the word "privile

Hidden State result: Aid: A charity single recorded by a


Layer 3

Residual pre-MLP result: : A live television show, Oscar: Award given

MLP Output result: ater: a large, deep hollow in the

Hidden State result: : American sketch comedy and variety show, Dway


Layer 4

Residual pre-MLP result: : These are just a few examples of things that

MLP Output result: : What do these three things have in common?

Hidden State result: : American sketch comedy and variety show, Spon



### Residual pre-MLP, MLP Outputs, Hidden State - Results Explanation

As can be seen, the token is interpreted as an American-sketch-comedy contextualized token from layer 3 onward (inclusive).

In the following part we amplify the MLP output and extract its meaning, starting from the layer just before layer 3.

Please note that all of the unamplified MLP outputs above yielded nonsense.


In [13]:
# Layer at which the hidden state first showed the contextualized meaning.
realization_layer = 3

# Sweep one layer before, at, and one layer after the realization layer,
# scaling the MLP output by 3, 6, 9, 12 and 15.
# NOTE(review): this loop patches every vector into layer 0 (target_layer=0),
# whereas the amplification loops for the two earlier examples pass
# target_layer=layer -- confirm which is intended before comparing results.
# NOTE(review): the execution counts show this cell ([13]) was run before the
# cell above it ([14]); re-run top to bottom to make sure the caches used here
# match the displayed outputs.
for layer in (realization_layer - 1, realization_layer, realization_layer + 1):
    print(f"Layer {layer}")
    for amp in (3, 6, 9, 12, 15):
        print()
        amplified = mlp_outputs_cache[layer][source_position] * amp
        print(f"MLP Output (Amp={amp}) result: {patchscope_interpret(amplified, target_layer=0)}")
    print("\n")

Layer 2

MLP Output (Amp=3) result: : A live television variety show, Titanic

MLP Output (Amp=6) result: : type of television programming, Eminem:

MLP Output (Amp=9) result: : Social media platform, Wakanda: F

MLP Output (Amp=12) result: : a popular video-sharing platform, The

MLP Output (Amp=15) result: : A live television variety show, The Lion King


Layer 3

MLP Output (Amp=3) result: , Tesla: American electric vehicle and clean

MLP Output (Amp=6) result: , and Instagram: photo and video-sh

MLP Output (Amp=9) result: , Meryl Streep: American actress

MLP Output (Amp=12) result: : What do these three things have in common?

MLP Output (Amp=15) result: , Nike: American multinational corporation


Layer 4

MLP Output (Amp=3) result: , and Easter: Christian holiday that celebr

MLP Output (Amp=6) result: :

* Leonardo DiCaprio is

MLP Output (Amp=9) result: : Sketch comedy television series, and more


MLP Output (Amp=12) result: : Satirical comedy, The Lion King:

MLP Output (Am

### Results Analysis
As can be seen, we get television-, performance- and American-related results starting from layer 2, achieving our objective.
Please also note that layer 4 amplified by 9 even yielded the meaning of the entire phrase ("Sketch comedy television series").