In [None]:
import os

# Detect whether we are running inside Google Colab: the `google.colab` package
# is only importable there.
try:
    import google.colab  # noqa: F401 -- imported purely as a runtime probe

    IN_COLAB = True
    print("Running as a Colab notebook")

except ImportError:
    # Only a failed import means "not Colab"; any other error should surface
    # instead of being silently swallowed by a bare `except:`.
    IN_COLAB = False
    print("Running as a Jupyter notebook - intended for development only!")
    from IPython import get_ipython

    ipython = get_ipython()
    # Automatically reload the HookedTransformer code as it's edited, without restarting the kernel.
    # get_ipython() returns None when no IPython shell is active, so guard the calls.
    if ipython is not None:
        ipython.run_line_magic("load_ext", "autoreload")
        ipython.run_line_magic("autoreload", "2")

Running as a Colab notebook


In [None]:
if IN_COLAB:
    # Colab starts from a clean image, so install the notebook's dependencies.
    # os.system returns the command's exit status; fail loudly here rather than
    # with a confusing ImportError in a later cell.
    for package in ("transformer_lens", "circuitsvis", "gradio"):
        exit_status = os.system(f"pip install {package}")
        if exit_status != 0:
            raise RuntimeError(
                f"`pip install {package}` failed with exit status {exit_status}"
            )

In [None]:
import html
from typing import Dict, Union, List

import gradio as gr
import torch
from IPython.display import HTML
from transformer_lens import HookedTransformer
from transformer_lens.utils import to_numpy

In [None]:
# This notebook only runs inference, so gradient tracking is disabled globally.
torch.set_grad_enabled(False)  # save memory
# Load the pretrained GPT-2 XL weights into a HookedTransformer.
model = HookedTransformer.from_pretrained("gpt2-xl")
# Put the model in evaluation mode.
model.eval()
# Move the model to the GPU when one is available; otherwise leave it where it was loaded.
if torch.cuda.is_available():
    model.to('cuda')

Downloading (…)lve/main/config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model gpt2-xl into HookedTransformer
Moving model to device:  cuda


Parameter Names
Here is a list of the parameters and shapes in the model. By convention, all weight matrices multiply on the right (ie new_activation = old_activation @ weights + bias).

Reminder of the key hyper-params:
- n_layers: 48. The number of transformer blocks in the model (a block contains an attention layer and an MLP layer)
- n_heads: 25. The number of attention heads per attention layer
- d_model: 1600. The residual stream width.
- d_head: 64. The internal dimension of an attention head activation.
- d_mlp: 6400. The internal dimension of the MLP layers (ie the number of neurons).
- d_vocab: 50257. The number of tokens in the vocabulary.
- n_ctx: 1024. The maximum number of tokens in an input prompt.

In [None]:
for name, param in model.named_parameters():
    if name.startswith("blocks.0."):
        print(name, param.shape)

blocks.0.attn.W_Q torch.Size([25, 1600, 64])
blocks.0.attn.W_K torch.Size([25, 1600, 64])
blocks.0.attn.W_V torch.Size([25, 1600, 64])
blocks.0.attn.W_O torch.Size([25, 64, 1600])
blocks.0.attn.b_Q torch.Size([25, 64])
blocks.0.attn.b_K torch.Size([25, 64])
blocks.0.attn.b_V torch.Size([25, 64])
blocks.0.attn.b_O torch.Size([1600])
blocks.0.mlp.W_in torch.Size([1600, 6400])
blocks.0.mlp.b_in torch.Size([6400])
blocks.0.mlp.W_out torch.Size([6400, 1600])
blocks.0.mlp.b_out torch.Size([1600])


In [None]:
for name, param in model.named_parameters():
    if not name.startswith("blocks"):
        print(name, param.shape)

embed.W_E torch.Size([50257, 1600])
pos_embed.W_pos torch.Size([1024, 1600])
unembed.W_U torch.Size([1600, 50257])
unembed.b_U torch.Size([50257])


In [None]:
def get_neuron_acts(text, layer, neuron_index):
    """
    text: The text to run through the model
    layer: The layer index of the MLP to read from
    neuron_index: The index of the neuron within that MLP layer

    Returns a numpy array with the neuron's post-activation value at each
    position of the first batch element.
    """
    hook_name = f"blocks.{layer}.mlp.hook_post"
    # The hook cannot return a value to us, so it writes into this dict,
    # which the enclosing function reads after the forward pass.
    captured = {}

    def store_neuron_acts(act, hook):
        # Keep batch element 0, every position, and only the requested neuron.
        # NOTE(review): assumes act is indexed (batch, position, neuron) - confirm.
        captured["activation"] = act[0, :, neuron_index]

    model.run_with_hooks(text, fwd_hooks=[(hook_name, store_neuron_acts)])
    return to_numpy(captured["activation"])

In [None]:
# Sample passage used as the input text for the visualizations in this notebook.
gpt2_text = 'Public Inc. announced the launch of Sit Kicker. Sit Kicker is a nationwide initiative focused on encouraging Canadians who work in office settings to reduce sedentary behaviour and "kick the sit" out of their work habits by shifting workplace culture towards more stand-friendly physical work environments. The Public Health Agency of Canada is providing'
# Tokenized form of the passage; passed to model.run_with_cache further down.
gpt2_tokens = model.to_tokens(gpt2_text)

In [None]:
# Layer / neuron pair used as the running example in the rest of the notebook.
default_layer = 21
default_neuron_index = 2932
# Show the string tokens next to the example neuron's activations
# (one activation per token position).
print(model.to_str_tokens(gpt2_text))
print(get_neuron_acts(gpt2_text, default_layer, default_neuron_index))

['<|endoftext|>', 'Public', ' Inc', '.', ' announced', ' the', ' launch', ' of', ' Sit', ' Kick', 'er', '.', ' Sit', ' Kick', 'er', ' is', ' a', ' nationwide', ' initiative', ' focused', ' on', ' encouraging', ' Canadians', ' who', ' work', ' in', ' office', ' settings', ' to', ' reduce', ' sed', 'entary', ' behaviour', ' and', ' "', 'kick', ' the', ' sit', '"', ' out', ' of', ' their', ' work', ' habits', ' by', ' shifting', ' workplace', ' culture', ' towards', ' more', ' stand', '-', 'friendly', ' physical', ' work', ' environments', '.', ' The', ' Public', ' Health', ' Agency', ' of', ' Canada', ' is', ' providing']
[ 3.0345567e-02 -1.2449749e-01  6.3414079e-01  1.1684608e+00
 -1.7001133e-01 -8.6407743e-02 -7.8891933e-02 -4.7523014e-02
 -1.2944321e-01 -1.2590133e-01 -7.2261885e-02 -1.6021019e-01
  9.0067488e-01 -1.6459474e-01 -4.6006512e-02 -5.9493408e-02
 -5.7881549e-02 -3.8879093e-02 -1.6462146e-01 -4.6243098e-02
 -2.6917210e-02 -2.9468775e-02  7.4035578e+00 -2.8852530e-02
 -3.68

In [None]:
# CSS that gives each token span a thin gray border, making token boundaries easy to see.
# It is prepended to the HTML produced by basic_neuron_vis.
style_string = """<style> 
    span.token {
        border: 1px solid rgb(123, 123, 123)
        } 
    </style>"""


def calculate_color(val, max_val, min_val):
    """
    val: The value to color
    max_val: The value that maps to full red
    min_val: The value that maps to off-white

    Returns a CSS color string of the form "rgb(240, g, b)" that interpolates
    between slightly off-white (val <= min_val) and red (val >= max_val).

    Note: values outside [min_val, max_val] are clamped to the ends of the range,
    and an empty or inverted range (max_val <= min_val) yields off-white.
    """
    value_range = max_val - min_val
    if value_range > 0:
        # Normalize by the width of the range, not by max_val alone, so that a
        # non-zero min_val still maps to 0.
        normalized_val = (val - min_val) / value_range
        # Clamp so a custom range narrower than the data never produces color
        # channels outside [0, 240].
        normalized_val = min(max(normalized_val, 0.0), 1.0)
    else:
        # Degenerate range: avoid dividing by zero and fall back to off-white.
        normalized_val = 0.0
    fade = 240 * (1 - normalized_val)
    return f"rgb(240, {fade}, {fade})"


def basic_neuron_vis(text, layer, neuron_index, max_val=None, min_val=None):
    """
    text: The text to visualize
    layer: The layer index
    neuron_index: The neuron index
    max_val: The top end of our activation range, defaults to the maximum activation
    min_val: The bottom end of our activation range, defaults to the minimum activation

    Returns a string of HTML that displays the text with each token colored according to its activation.
    If layer or neuron_index is None, returns a short prompt string instead.

    Note: It's useful to be able to input a fixed max_val and min_val, because otherwise the colors will change as you edit the text, which is annoying.
    """
    if layer is None:
        return "Please select a Layer"
    if neuron_index is None:
        return "Please select a Neuron"
    acts = get_neuron_acts(text, layer, neuron_index)
    act_max = acts.max()
    act_min = acts.min()
    # Defaults to the max and min of the activations
    if max_val is None:
        max_val = act_max
    if min_val is None:
        min_val = act_min
    # We want to make a list of HTML strings to concatenate into our final HTML string
    # We first add the style to make each token element have a nice border
    htmls = [style_string]
    # We then add some text to tell us what layer and neuron we're looking at
    # h4 means "small heading"
    htmls.append(f"<h4>Layer: <b>{layer}</b>. Neuron Index: <b>{neuron_index}</b></h4>")
    # We then add a line telling us the limits of our range
    htmls.append(
        f"<h4>Max Range: <b>{max_val:.4f}</b>. Min Range: <b>{min_val:.4f}</b></h4>"
    )
    # If we added a custom range, print a line telling us the range of our activations too.
    if act_max != max_val or act_min != min_val:
        htmls.append(
            f"<h4>Custom Range Set. Max Act: <b>{act_max:.4f}</b>. Min Act: <b>{act_min:.4f}</b></h4>"
        )
    # Convert the text to a list of tokens
    str_tokens = model.to_str_tokens(text)
    for tok, act in zip(str_tokens, acts):
        # A span is an HTML element that lets us style a part of a string (and remains on the same line by default)
        # We set the background color of the span to be the color we calculated from the activation
        # The token text is HTML-escaped: tokens such as "<|endoftext|>" or a bare "&"
        # would otherwise be parsed as markup and vanish or corrupt the rendering.
        htmls.append(
            f"<span class='token' style='background-color:{calculate_color(act, max_val, min_val)}' >{html.escape(tok)}</span>"
        )

    return "".join(htmls)

In [None]:
# Build the visualization for the default neuron with a fixed color range;
# basic_neuron_vis hands back a plain string of HTML.
default_min_val, default_max_val = 0.0, 4.0
default_html_string = basic_neuron_vis(
    gpt2_text, default_layer, default_neuron_index,
    max_val=default_max_val, min_val=default_min_val,
)

# Rendered view: IPython can display an HTML string as rich output.
print("Displayed HTML")
display(HTML(default_html_string))

# Raw view: the very same string, printed as text.
print("HTML String - it's just raw HTML code!")
print(default_html_string)

Displayed HTML


HTML String - it's just raw HTML code!
<style> 
    span.token {
        border: 1px solid rgb(123, 123, 123)
        } 
    </style><h4>Layer: <b>21</b>. Neuron Index: <b>2932</b></h4><h4>Max Range: <b>4.0000</b>. Min Range: <b>0.0000</b></h4><h4>Custom Range Set. Max Act: <b>7.4036</b>. Min Act: <b>-0.1700</b></h4><span class='token' style='background-color:rgb(240, 238.17926600575447, 238.17926600575447)' ><|endoftext|></span><span class='token' style='background-color:rgb(240, 247.4698492884636, 247.4698492884636)' >Public</span><span class='token' style='background-color:rgb(240, 201.95155262947083, 201.95155262947083)' > Inc</span><span class='token' style='background-color:rgb(240, 169.89234924316406, 169.89234924316406)' >.</span><span class='token' style='background-color:rgb(240, 250.2006796002388, 250.2006796002388)' > announced</span><span class='token' style='background-color:rgb(240, 245.1844646036625, 245.1844646036625)' > the</span><span class='token' style='background-

In [None]:
# Run the model once on the tokenized passage, keeping both the logits and a cache of
# intermediate activations; remove_batch_dim=True asks for the batch dimension to be dropped.
gpt2_logits, gpt2_cache = model.run_with_cache(gpt2_tokens, remove_batch_dim=True)

In [None]:
# Display the cache; its repr lists every hook name that was recorded.
gpt2_cache

ActivationCache with keys ['hook_embed', 'hook_pos_embed', 'blocks.0.hook_resid_pre', 'blocks.0.ln1.hook_scale', 'blocks.0.ln1.hook_normalized', 'blocks.0.attn.hook_q', 'blocks.0.attn.hook_k', 'blocks.0.attn.hook_v', 'blocks.0.attn.hook_attn_scores', 'blocks.0.attn.hook_pattern', 'blocks.0.attn.hook_z', 'blocks.0.hook_attn_out', 'blocks.0.hook_resid_mid', 'blocks.0.ln2.hook_scale', 'blocks.0.ln2.hook_normalized', 'blocks.0.mlp.hook_pre', 'blocks.0.mlp.hook_post', 'blocks.0.hook_mlp_out', 'blocks.0.hook_resid_post', 'blocks.1.hook_resid_pre', 'blocks.1.ln1.hook_scale', 'blocks.1.ln1.hook_normalized', 'blocks.1.attn.hook_q', 'blocks.1.attn.hook_k', 'blocks.1.attn.hook_v', 'blocks.1.attn.hook_attn_scores', 'blocks.1.attn.hook_pattern', 'blocks.1.attn.hook_z', 'blocks.1.hook_attn_out', 'blocks.1.hook_resid_mid', 'blocks.1.ln2.hook_scale', 'blocks.1.ln2.hook_normalized', 'blocks.1.mlp.hook_pre', 'blocks.1.mlp.hook_post', 'blocks.1.hook_mlp_out', 'blocks.1.hook_resid_post', 'blocks.2.hook_re

In [None]:
# Read the dimensions from the loaded model's config instead of hard-coding the
# GPT-2 XL values, so they stay correct if a different model is loaded above.
n_layers = model.cfg.n_layers
d_mlp = model.cfg.d_mlp

In [None]:
# The passage split into per-token strings, one list entry per token position.
gpt2_str_tokens = model.to_str_tokens(gpt2_text)
print(gpt2_str_tokens)

['<|endoftext|>', 'Public', ' Inc', '.', ' announced', ' the', ' launch', ' of', ' Sit', ' Kick', 'er', '.', ' Sit', ' Kick', 'er', ' is', ' a', ' nationwide', ' initiative', ' focused', ' on', ' encouraging', ' Canadians', ' who', ' work', ' in', ' office', ' settings', ' to', ' reduce', ' sed', 'entary', ' behaviour', ' and', ' "', 'kick', ' the', ' sit', '"', ' out', ' of', ' their', ' work', ' habits', ' by', ' shifting', ' workplace', ' culture', ' towards', ' more', ' stand', '-', 'friendly', ' physical', ' work', ' environments', '.', ' The', ' Public', ' Health', ' Agency', ' of', ' Canada', ' is', ' providing']


In [None]:
# Model dimensions taken from the loaded config.
# NOTE(review): n_layers and d_mlp were already assigned in an earlier cell; this rebinds them.
n_layers = model.cfg.n_layers
d_mlp = model.cfg.d_mlp
# Number of token positions in the sample passage.
n_positions = len(gpt2_str_tokens)

In [None]:
# NOTE(review): this repeats the get_neuron_acts definition from earlier in the
# notebook; the later definition shadows the earlier one. Consider keeping only one.
def get_neuron_acts(text, layer, neuron_index):
    """
    text: The text to run through the model
    layer: The layer index of the MLP to read from
    neuron_index: The index of the neuron within that MLP layer

    Returns a numpy array with the neuron's post-activation value at each
    position of the first batch element.
    """
    hook_name = f"blocks.{layer}.mlp.hook_post"
    # The hook cannot return a value to us, so it writes into this dict,
    # which the enclosing function reads after the forward pass.
    captured = {}

    def store_neuron_acts(act, hook):
        # Keep batch element 0, every position, and only the requested neuron.
        # NOTE(review): assumes act is indexed (batch, position, neuron) - confirm.
        captured["activation"] = act[0, :, neuron_index]

    model.run_with_hooks(text, fwd_hooks=[(hook_name, store_neuron_acts)])
    return to_numpy(captured["activation"])

In [None]:
from einops import rearrange

def get_all_mlp_neuron_acts(text):
    """
    text: The text to run through the model

    Returns a float32 numpy array of post-activation MLP neuron values for the
    first batch element, indexed as [layer, neuron, position].

    Note: all layers are captured in a single forward pass (one hook per layer),
    and the result lives on whatever device the model's activations are on, so
    this also works without a GPU.
    """
    n_layers = model.cfg.n_layers
    # Hooks cannot return values, so each one writes its layer's activations here.
    acts_by_layer = {}

    def make_caching_hook(layer):
        # Bind the layer index now; each layer gets its own hook function.
        def caching_hook(act, hook):
            # Keep batch element 0: what remains is indexed [position, neuron].
            acts_by_layer[layer] = act[0, :, :]

        return caching_hook

    model.run_with_hooks(
        text,
        fwd_hooks=[
            (f"blocks.{layer}.mlp.hook_post", make_caching_hook(layer))
            for layer in range(n_layers)
        ],
    )

    # Stack in layer order -> [layer, position, neuron]
    activations = torch.stack(
        [acts_by_layer[layer] for layer in range(n_layers)]
    ).to(torch.float32)

    # Swap the last two axes -> [layer, neuron, position]
    return to_numpy(activations.permute(0, 2, 1))

In [None]:
# Collect the MLP neuron activations of every layer for the sample passage.
all_mlp_activations = get_all_mlp_neuron_acts(gpt2_text)
# Expected layout: (n_layers, d_mlp, n_positions).
all_mlp_activations.shape

(48, 6400, 65)

In [None]:
# Shape of a single layer's slice: (d_mlp, n_positions).
all_mlp_activations[0].shape

(6400, 65)

In [None]:
# NOTE(review): this cell repeats an earlier cell verbatim; the values are unchanged.
# Layer / neuron pair used as the running example.
default_layer = 21
default_neuron_index = 2932
# Print the tokens and the single-neuron activations, for comparison with the
# all-layer array computed above.
print(model.to_str_tokens(gpt2_text))
print(get_neuron_acts(gpt2_text, default_layer, default_neuron_index))

['<|endoftext|>', 'Public', ' Inc', '.', ' announced', ' the', ' launch', ' of', ' Sit', ' Kick', 'er', '.', ' Sit', ' Kick', 'er', ' is', ' a', ' nationwide', ' initiative', ' focused', ' on', ' encouraging', ' Canadians', ' who', ' work', ' in', ' office', ' settings', ' to', ' reduce', ' sed', 'entary', ' behaviour', ' and', ' "', 'kick', ' the', ' sit', '"', ' out', ' of', ' their', ' work', ' habits', ' by', ' shifting', ' workplace', ' culture', ' towards', ' more', ' stand', '-', 'friendly', ' physical', ' work', ' environments', '.', ' The', ' Public', ' Health', ' Agency', ' of', ' Canada', ' is', ' providing']
[ 3.0345567e-02 -1.2449749e-01  6.3414079e-01  1.1684608e+00
 -1.7001133e-01 -8.6407743e-02 -7.8891933e-02 -4.7523014e-02
 -1.2944321e-01 -1.2590133e-01 -7.2261885e-02 -1.6021019e-01
  9.0067488e-01 -1.6459474e-01 -4.6006512e-02 -5.9493408e-02
 -5.7881549e-02 -3.8879093e-02 -1.6462146e-01 -4.6243098e-02
 -2.6917210e-02 -2.9468775e-02  7.4035578e+00 -2.8852530e-02
 -3.68

In [None]:
# The example neuron's activations, read out of the all-layer array.
# Uses the named defaults rather than repeating the literal indices.
all_mlp_activations[default_layer, default_neuron_index, :]

array([ 3.0345567e-02, -1.2449749e-01,  6.3414079e-01,  1.1684608e+00,
       -1.7001133e-01, -8.6407743e-02, -7.8891933e-02, -4.7523014e-02,
       -1.2944321e-01, -1.2590133e-01, -7.2261885e-02, -1.6021019e-01,
        9.0067488e-01, -1.6459474e-01, -4.6006512e-02, -5.9493408e-02,
       -5.7881549e-02, -3.8879093e-02, -1.6462146e-01, -4.6243098e-02,
       -2.6917210e-02, -2.9468775e-02,  7.4035578e+00, -2.8852530e-02,
       -3.6855445e-03, -1.5870464e-03, -1.2615668e-02, -3.3700727e-02,
       -4.9219999e-02, -1.4580110e-01, -1.6823016e-01, -1.6890118e-01,
        2.1692955e+00, -1.1603283e-02, -1.5672741e-03, -1.1570438e-02,
       -9.8077348e-03, -1.6887696e-01, -1.3756633e-02, -1.2688768e-06,
       -2.1230393e-05, -1.5142341e-03, -2.0473195e-02, -1.6957280e-01,
       -4.4387090e-03, -1.2596864e-01, -1.0008973e-01, -1.6409762e-01,
       -7.8130469e-02, -2.6475465e-02, -6.4075842e-02, -1.2644984e-01,
       -1.5964326e-01, -1.3119545e-02, -6.6182002e-02, -6.5837346e-02,
      

In [None]:
# Sanity check: the all-layer array must agree with the single-neuron helper at
# every position. Reduce to a single boolean instead of dumping the element-wise array.
(
    all_mlp_activations[default_layer, default_neuron_index, :]
    == get_neuron_acts(gpt2_text, default_layer, default_neuron_index)
).all()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])