In [1]:
import os
DEV_MODE = True
try:
    import google.colab
    IN_COLAB = True
    print("Running as a Colab notebook")
except:
    IN_COLAB = False
# Install if in Colab
if IN_COLAB:
    %pip install transformer_lens
    %pip install circuitsvis
    # Install a faster Node version
    !curl -fsSL https://deb.nodesource.com/setup_16.x | sudo -E bash -; sudo apt-get install -y nodejs  # noqa

# Hot reload in development mode & not running on the CD
if not IN_COLAB:
    from IPython import get_ipython
    ip = get_ipython()
    if not ip.extension_manager.loaded:
        ip.extension_manager.load('autoreload')
        %autoreload 2
        
IN_GITHUB = os.getenv("GITHUB_ACTIONS") == "true"

# change renderer to colab if needed
import plotly.io as pio
if IN_COLAB or not DEV_MODE:
    pio.renderers.default = "colab"
else:
    pio.renderers.default = "notebook_connected"
    
print(f"Using renderer: {pio.renderers.default}")

# import circuit vis
import circuitsvis as cv
# Testing that the library works
cv.examples.hello("Neel")

import warnings
warnings.filterwarnings("ignore")


Using renderer: notebook_connected


In [2]:
# Main imports
import torch
import torch.nn as nn
import einops
from fancy_einsum import einsum
import tqdm.auto as tqdm
import plotly.express as px

from jaxtyping import Float
from functools import partial

# transformer lens stuff
import transformer_lens.utils as utils
from transformer_lens.hook_points import (
    HookPoint,
)  # Hooking utilities
from transformer_lens import HookedTransformer, FactoredMatrix


In [3]:
# set grad to false cuz we dont need to train
torch.set_grad_enabled(False)


<torch.autograd.grad_mode.set_grad_enabled at 0x10f4725e0>

In [53]:
# some plotting functions

# heatmap
def imshow(tensor, renderer=None, xaxis="", yaxis="", **kwargs):
    px.imshow(utils.to_numpy(tensor), color_continuous_midpoint=0.0, color_continuous_scale="RdBu", labels={"x":xaxis, "y":yaxis}, **kwargs).show(renderer=renderer)

def line(tensor, renderer=None, xaxis="", yaxis="", **kwargs):
    px.line(utils.to_numpy(tensor), labels={"x":xaxis, "y":yaxis}, **kwargs).show(renderer)

def scatter(x, y, xaxis="", yaxis="", caxis="", renderer=None, **kwargs):
    x = utils.to_numpy(x)
    y = utils.to_numpy(y)
    px.scatter(y=y, x=x, labels={"x":xaxis, "y":yaxis, "color":caxis}, **kwargs).show(renderer)


In [5]:
# set device
device = utils.get_device()
print(f"Using device: {device}")


Using device: mps


# Loading models

In [6]:
model = HookedTransformer.from_pretrained("gpt2-small", device=device)


Loaded pretrained model gpt2-small into HookedTransformer


# return types:

- logits: returns logits of shape B x Pos x d_model
- loss: return CE loss on next token
- both: returns tuple of (logit, loss)
- None: runs model but doesnt return, useful for activations

In [7]:
# run example on harry potter first text
random_text = """
Mr and Mrs Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense.
"""

logits, loss = model(random_text, return_type="both")
print(f"model_loss: {loss}")
print(f"model logits shape: {logits.shape}")



model_loss: 3.840034008026123
model logits shape: torch.Size([1, 64, 50257])


# Tokenization
- to_str_tokens(text): converts string into list of tokens as strings
- model.to_tokens(text): converts string to tensor of tokens 
- model.to_string(tokens): converts tensor of tokens to string

- you can remove bos by setting prepend_bos = False in to_tokens, to_str_tokens, model.forward


In [11]:
# example on tokenization
print(model.to_str_tokens("gpt2"))
print(model.to_str_tokens(["gpt2", "gpt2"]))
print(model.to_tokens("gpt2"))
print(model.to_string([50256, 70, 457, 17]))


['<|endoftext|>', 'g', 'pt', '2']
[['<|endoftext|>', 'g', 'pt', '2'], ['<|endoftext|>', 'g', 'pt', '2']]
tensor([[50256,    70,   457,    17]], device='mps:0')
<|endoftext|>gpt2


In [15]:
# Getting number of correct tokens
logits = model(random_text, return_type="logits")

# we dont calculate for last token
prediction = logits.argmax(dim=-1).squeeze()[:-1]
# skip first
true_tokens = model.to_tokens(random_text).squeeze()[1:]
is_correct = (prediction == true_tokens)

print(f"Model accuracy: {is_correct.sum()/len(true_tokens)}")
print(f"Correct tokens: {model.to_str_tokens(prediction[is_correct])}")


Model accuracy: 0.3333333432674408
Correct tokens: ['\n', ' Mrs', 'ley', ',', ',', ' Drive', ',', ' to', ' they', ' you', ' very', ' much', '.', ' were', ' to', ' in', ' they', '�', 't', '.', '\n']


# caching model activations
- we do that using run_with_cache method
- we can use remove_batch_dim=True to remove batch_dim when returing. Useful when we have 1 sentence input only

In [8]:
gpt2_text = "Natural language processing tasks, such as question answering, machine translation, reading comprehension, and summarization, are typically approached with supervised learning on taskspecific datasets."
gpt2_tokens = model.to_tokens(gpt2_text)
print(gpt2_tokens.device)
gpt2_logits, gpt2_cache = model.run_with_cache(gpt2_tokens, remove_batch_dim=True)


mps:0


In [9]:
gpt2_cache.keys()


dict_keys(['hook_embed', 'hook_pos_embed', 'blocks.0.hook_resid_pre', 'blocks.0.ln1.hook_scale', 'blocks.0.ln1.hook_normalized', 'blocks.0.attn.hook_q', 'blocks.0.attn.hook_k', 'blocks.0.attn.hook_v', 'blocks.0.attn.hook_attn_scores', 'blocks.0.attn.hook_pattern', 'blocks.0.attn.hook_z', 'blocks.0.hook_attn_out', 'blocks.0.hook_resid_mid', 'blocks.0.ln2.hook_scale', 'blocks.0.ln2.hook_normalized', 'blocks.0.mlp.hook_pre', 'blocks.0.mlp.hook_post', 'blocks.0.hook_mlp_out', 'blocks.0.hook_resid_post', 'blocks.1.hook_resid_pre', 'blocks.1.ln1.hook_scale', 'blocks.1.ln1.hook_normalized', 'blocks.1.attn.hook_q', 'blocks.1.attn.hook_k', 'blocks.1.attn.hook_v', 'blocks.1.attn.hook_attn_scores', 'blocks.1.attn.hook_pattern', 'blocks.1.attn.hook_z', 'blocks.1.hook_attn_out', 'blocks.1.hook_resid_mid', 'blocks.1.ln2.hook_scale', 'blocks.1.ln2.hook_normalized', 'blocks.1.mlp.hook_pre', 'blocks.1.mlp.hook_post', 'blocks.1.hook_mlp_out', 'blocks.1.hook_resid_post', 'blocks.2.hook_resid_pre', 'block

# visualize attention pattern
- get attention using gpt2_cache


In [18]:
# attention pattern of layer 0
attention_pattern = gpt2_cache["pattern", 0, "attn"]
print(attention_pattern.shape)
gpt2_str_tokens = model.to_str_tokens(gpt2_text)

# plot the attention pattern using circuitvis
print("Layer 0 Head Attention Patterns:")
display(cv.attention.attention_patterns(tokens=gpt2_str_tokens, 
                                attention=attention_pattern,
                                
))


torch.Size([12, 33, 33])
Layer 0 Head Attention Patterns:


# Ablations and interventions
- Every activation is surrounded by hook point.
- Hook function maps current_activation_value, hook point to new_activation_value.
-  model computes activations as normal while running, but hook function then takes that activation and replaces it with new one
- we run with hooks using run_with_hooks method

In [42]:
#  ablate specific head with certain value
def head_ablation_hook(
    attn_result: Float[torch.Tensor, "batch pos head_index d_head"],
    hook: HookPoint,
    head_index_to_ablate: int,
    value_to_ablate_with: float = 0.0,
) -> Float[torch.Tensor, "batch pos head_index d_head"]:
    
    print(f"Shape of value tensor: {attn_result.shape}")
    attn_result[:, :, head_index_to_ablate, : ] = value_to_ablate_with
    return attn_result


In [43]:
# test loss before ablating and after

head_index_to_ablate = 7
value_to_ablate_with = 0
layer_to_ablate = 1

original_loss = model(gpt2_text, return_type="loss")
hook_h_v = partial(head_ablation_hook, head_index_to_ablate=head_index_to_ablate, value_to_ablate_with=value_to_ablate_with)

ablated_loss = model.run_with_hooks(
    gpt2_text,
    return_type="loss",
    fwd_hooks=[(
        utils.get_act_name("pattern", layer_to_ablate),
        hook_h_v
    )]
)

print(f"Original Loss: {original_loss.item():.3f}")
print(f"Ablated Loss: {ablated_loss.item():.3f}")


Shape of value tensor: torch.Size([1, 12, 33, 33])
Original Loss: 3.999
Ablated Loss: 3.981


In [44]:
# clean things up
# model.reset_hooks()

# can add hooks to keep
# use add_perma_hook


# Activation patching

In [46]:
print(gpt2_logits.shape)


torch.Size([1, 33, 50257])


In [48]:
# function to calculate logit difference between correct and corrupted answer
def logits_to_logit_diff(logits, correct_answer=" John", incorrect_answer=" Mary"):
    
    correct_idx = model.to_single_token(correct_answer)
    incorrent_idx = model.to_single_token(incorrect_answer)
    return logits[0, -1, correct_idx] - logits[0, -1, incorrent_idx]

clean_prompt = "After John and Mary went to the store, Mary gave a bottle of milk to"
corrupted_prompt = "After John and Mary went to the store, John gave a bottle of milk to"

clean_tokens = model.to_tokens(clean_prompt)
corrupted_tokens = model.to_tokens(corrupted_prompt)

# We run on the clean prompt with the cache so we store activations to patch in later.
clean_logits, clean_cache = model.run_with_cache(clean_tokens)
clean_logit_diff = logits_to_logit_diff(clean_logits)
print(f"Clean logit difference: {clean_logit_diff.item():.3f}")

# We don't need to cache on the corrupted prompt.
corrupted_logits = model(corrupted_tokens)
corrupted_logit_diff = logits_to_logit_diff(corrupted_logits)
print(f"Corrupted logit difference: {corrupted_logit_diff.item():.3f}")


Clean logit difference: 4.276
Corrupted logit difference: -2.738


In [50]:
# Now we do activation patching on residual strea, for all layers

def residual_stream_patching_hook(
    resid_pre: Float[torch.Tensor, "batch pos d_model"],
    hook: HookPoint,
    pos: int,
) -> Float[torch.Tensor, "batch pos d_model"]:
    # each HookPoint hasd a name attribute giving the name of the hook
    clean_resid_pre = clean_cache[hook.name]
    # replace with clean
    resid_pre[:, pos, :] = clean_resid_pre[:, pos, :]
    return resid_pre


# We make a tensor to store the results for each patching run. We put it on the model's device to avoid needing to move things between the GPU and CPU, which can be slow.
num_positions = len(clean_tokens[0])
ioi_patching_result = torch.zeros((model.cfg.n_layers, 
                                   num_positions), 
                                  device=model.cfg.device)

# We run the patching for each layer.
for layer in tqdm.tqdm(range(model.cfg.n_layers), desc="Patching layers"):
    # We patch the residual stream for each position in the input.
    for pos in range(num_positions):
        # We patch the residual stream for the position.
        temp_hook = partial(residual_stream_patching_hook, pos=pos)
        patched_logits = model.run_with_hooks(
            corrupted_tokens,
            return_type="logits",
            fwd_hooks=[(
                utils.get_act_name("resid_pre", layer),
                temp_hook,
            )],
        )
        patched_logit_diff = logits_to_logit_diff(patched_logits).detach()
        # We calculate the logit difference for the patched position.
        ioi_patching_result[layer, pos] = (patched_logit_diff - corrupted_logit_diff)/(clean_logit_diff - corrupted_logit_diff)
        

Patching layers:   0%|          | 0/12 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [54]:
token_labels = [f"{token}_{idx}" for idx, token, in enumerate(model.to_str_tokens(clean_tokens))]
imshow(ioi_patching_result, x=token_labels, xaxis="Pos", yaxis="Layer", title="IOI Patching Result")


# Accessing activations using hooks

In [55]:
batch_sz = 10
seq_len = 50
size = (batch_sz, seq_len)
# We make a random tensor to use as input.
input_tensor = torch.randint(1000, 10000, size)

random_tokens = input_tensor.to(model.cfg.device)
repeated_tokens = einops.repeat(random_tokens, "b s -> b (2 s)")
repeated_logits = model(repeated_tokens)
correct_log_prob = model.loss_fn(repeated_logits, repeated_tokens, per_token=True)
loss_by_pos = einops.reduce(correct_log_prob, "b s -> s", "mean")
line(loss_by_pos, xaxis="Position", yaxis="Loss", title="Loss by Position on random repeated tokens")


# We can use hooks to calculate stuff on the run

In [60]:
# lets calculate induction score

# define a store
induction_score_store = torch.zeros((model.cfg.n_layers, model.cfg.n_heads), device=model.cfg.device)

# define a hook
def induction_score_hook(
    pattern: Float[torch.Tensor, "batch head_index dest_pos src_pos"],
    hook: HookPoint,
):
    # take diagonal of attention pattern
    print(f"{pattern.shape = }")
    induction_stripe = pattern.diagonal(dim1=-2, dim2=-1, offset=1-seq_len)
    print(f"{induction_stripe.shape = }")
    # get avg score per head
    induction_score = einops.reduce(induction_stripe ,"batch head_index pos -> head_index", "mean")
    # store the result
    induction_score_store[hook.layer(), :] = induction_score
    
# make boolean filter on activation names
pattern_hook_names_filter = lambda name: name.endswith("pattern")


model.run_with_hooks(
    repeated_tokens,
    return_type=None, # efficiency
    fwd_hooks=[(
        pattern_hook_names_filter,
        induction_score_hook
    )]
)

imshow(induction_score_store, xaxis="Head", yaxis="Layer", title="Induction Score")


pattern.shape = torch.Size([10, 12, 100, 100])
induction_stripe.shape = torch.Size([10, 12, 51])
pattern.shape = torch.Size([10, 12, 100, 100])
induction_stripe.shape = torch.Size([10, 12, 51])
pattern.shape = torch.Size([10, 12, 100, 100])
induction_stripe.shape = torch.Size([10, 12, 51])
pattern.shape = torch.Size([10, 12, 100, 100])
induction_stripe.shape = torch.Size([10, 12, 51])
pattern.shape = torch.Size([10, 12, 100, 100])
induction_stripe.shape = torch.Size([10, 12, 51])
pattern.shape = torch.Size([10, 12, 100, 100])
induction_stripe.shape = torch.Size([10, 12, 51])
pattern.shape = torch.Size([10, 12, 100, 100])
induction_stripe.shape = torch.Size([10, 12, 51])
pattern.shape = torch.Size([10, 12, 100, 100])
induction_stripe.shape = torch.Size([10, 12, 51])
pattern.shape = torch.Size([10, 12, 100, 100])
induction_stripe.shape = torch.Size([10, 12, 51])
pattern.shape = torch.Size([10, 12, 100, 100])
induction_stripe.shape = torch.Size([10, 12, 51])
pattern.shape = torch.Size([10

# lets focus on L5H5

In [62]:
if IN_GITHUB:
    torch.manual_seed(50)
    
induction_head_layer = 5
induction_head_index = 5
size = (1, 20)
input_tensor = torch.randint(1000, 10000, size)

single_random_sequence = input_tensor.to(model.cfg.device)
repeated_random_sequence = einops.repeat(single_random_sequence, "batch seq_len -> batch (2 seq_len)")
def visualize_pattern_hook(
    pattern: Float[torch.Tensor, "batch head_index dest_pos source_pos"],
    hook: HookPoint,
):
    display(
        cv.attention.attention_patterns(
            tokens=model.to_str_tokens(repeated_random_sequence), 
            attention=pattern[0, induction_head_index, :, :][None, :, :] # Add a dummy axis, as CircuitsVis expects 3D patterns.
        )
    )

model.run_with_hooks(
    repeated_random_sequence, 
    return_type=None, 
    fwd_hooks=[(
        utils.get_act_name("pattern", induction_head_layer), 
        visualize_pattern_hook
    )]
)


# Available models
- can be found [here.](https://dynalist.io/d/n2ZWtnoYHrU1s4vnFSAQ519J#z=jHj79Pj58cgJKdq4t-ygK-4h)

# examples on distill gpt2

In [63]:
distilgpt2 = HookedTransformer.from_pretrained("distilgpt2", device=device)

distilgpt2_induction_score_store = torch.zeros((distilgpt2.cfg.n_layers, distilgpt2.cfg.n_heads), device=distilgpt2.cfg.device)
def induction_score_hook(
    pattern: Float[torch.Tensor, "batch head_index dest_pos source_pos"],
    hook: HookPoint,
):
    # We take the diagonal of attention paid from each destination position to source positions seq_len-1 tokens back
    # (This only has entries for tokens with index>=seq_len)
    induction_stripe = pattern.diagonal(dim1=-2, dim2=-1, offset=1-seq_len)
    # Get an average score per head
    induction_score = einops.reduce(induction_stripe, "batch head_index position -> head_index", "mean")
    # Store the result.
    distilgpt2_induction_score_store[hook.layer(), :] = induction_score

# We make a boolean filter on activation names, that's true only on attention pattern names.
pattern_hook_names_filter = lambda name: name.endswith("pattern")

distilgpt2.run_with_hooks(
    repeated_tokens, 
    return_type=None, # For efficiency, we don't need to calculate the logits
    fwd_hooks=[(
        pattern_hook_names_filter,
        induction_score_hook
    )]
)

imshow(distilgpt2_induction_score_store, xaxis="Head", yaxis="Layer", title="Induction Score by Head in Distil GPT-2")


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Loaded pretrained model distilgpt2 into HookedTransformer


# getting layer names


In [65]:
for name, param in model.named_parameters():
    if name.startswith("blocks.0."):
        print(name, param.shape)


# embedding and unembedding
for name, param in model.named_parameters():
    if not name.startswith("blocks"):
        print(name, param.shape)


blocks.0.attn.W_Q torch.Size([12, 768, 64])
blocks.0.attn.W_O torch.Size([12, 64, 768])
blocks.0.attn.b_Q torch.Size([12, 64])
blocks.0.attn.b_O torch.Size([768])
blocks.0.attn.W_K torch.Size([12, 768, 64])
blocks.0.attn.W_V torch.Size([12, 768, 64])
blocks.0.attn.b_K torch.Size([12, 64])
blocks.0.attn.b_V torch.Size([12, 64])
blocks.0.mlp.W_in torch.Size([768, 3072])
blocks.0.mlp.b_in torch.Size([3072])
blocks.0.mlp.W_out torch.Size([3072, 768])
blocks.0.mlp.b_out torch.Size([768])
embed.W_E torch.Size([50257, 768])
pos_embed.W_pos torch.Size([1024, 768])
unembed.W_U torch.Size([768, 50257])
unembed.b_U torch.Size([50257])


# getting activations hooks names

In [67]:
layer = 1
test_prompt = "The quick brown fox jumped over the lazy dog"
print("Num tokens:", len(model.to_tokens(test_prompt)[0]))

def print_name_shape_hook_function(activation, hook):
    print(hook.name, activation.shape)

not_in_late_block_filter = lambda name: name.startswith(f"blocks.{layer}.") or not name.startswith("blocks")

model.run_with_hooks(
    test_prompt,
    return_type=None,
    fwd_hooks=[(not_in_late_block_filter, print_name_shape_hook_function)],
)


Num tokens: 10
hook_embed torch.Size([1, 10, 768])
hook_pos_embed torch.Size([1, 10, 768])
blocks.1.hook_resid_pre torch.Size([1, 10, 768])
blocks.1.ln1.hook_scale torch.Size([1, 10, 1])
blocks.1.ln1.hook_normalized torch.Size([1, 10, 768])
blocks.1.ln1.hook_scale torch.Size([1, 10, 1])
blocks.1.ln1.hook_normalized torch.Size([1, 10, 768])
blocks.1.ln1.hook_scale torch.Size([1, 10, 1])
blocks.1.ln1.hook_normalized torch.Size([1, 10, 768])
blocks.1.attn.hook_q torch.Size([1, 10, 12, 64])
blocks.1.attn.hook_k torch.Size([1, 10, 12, 64])
blocks.1.attn.hook_v torch.Size([1, 10, 12, 64])
blocks.1.attn.hook_attn_scores torch.Size([1, 12, 10, 10])
blocks.1.attn.hook_pattern torch.Size([1, 12, 10, 10])
blocks.1.attn.hook_z torch.Size([1, 10, 12, 64])
blocks.1.hook_attn_out torch.Size([1, 10, 768])
blocks.1.hook_resid_mid torch.Size([1, 10, 768])
blocks.1.ln2.hook_scale torch.Size([1, 10, 1])
blocks.1.ln2.hook_normalized torch.Size([1, 10, 768])
blocks.1.mlp.hook_pre torch.Size([1, 10, 3072])
b

# folding LN
Turns out, this LayerNorm bias learns structure of the data that we can only see after folding! In particular, it essentially learns **unigram statistics** - rare tokens get suppressed, common tokens get boosted, by pretty dramatic degrees! Let's list the top and bottom 20 - at the top we see common punctuation and words like " the" and " and", at the bottom we see weird-ass tokens like " RandomRedditor":


In [68]:
unembed_bias = model.unembed.b_U
bias_values, bias_indices = unembed_bias.sort(descending=True)


top_k = 20
print(f"Top {top_k} values")
for i in range(top_k):
    print(f"{bias_values[i].item():.2f} {repr(model.to_string(bias_indices[i]))}")

print("...")
print(f"Bottom {top_k} values")
for i in range(top_k, 0, -1):
    print(f"{bias_values[-i].item():.2f} {repr(model.to_string(bias_indices[-i]))}")


Top 20 values
7.03 ','
6.98 ' the'
6.68 ' and'
6.49 '.'
6.48 '\n'
6.47 ' a'
6.41 ' in'
6.25 ' to'
6.16 ' of'
6.04 '-'
6.03 ' ('
5.88 ' "'
5.80 ' for'
5.72 ' that'
5.64 ' on'
5.59 ' is'
5.52 ' as'
5.49 ' at'
5.45 ' with'
5.44 ' or'
...
Bottom 20 values
-3.82 ' サーティ'
-3.83 '\x18'
-3.83 '\x14'
-3.83 ' RandomRedditor'
-3.83 '龍�'
-3.83 '�'
-3.83 '\x1b'
-3.83 '�'
-3.83 '\x05'
-3.83 '\x00'
-3.83 '\x06'
-3.83 '\x07'
-3.83 '\x0c'
-3.83 '\x02'
-3.83 'oreAndOnline'
-3.84 '\x11'
-3.84 '�'
-3.84 '\x10'
-3.84 '�'
-3.84 '�'


This can have real consequences for interpretability - for example, this bias favours " John" over " Mary" by about 1.2, about 1/3 of the effect size of the Indirect Object Identification Circuit! All other things being the same, this makes the John token 3.6x times more likely than the Mary token.

In [69]:
john_bias = model.unembed.b_U[model.to_single_token(' John')]
mary_bias = model.unembed.b_U[model.to_single_token(' Mary')]

print(f"John bias: {john_bias.item():.4f}")
print(f"Mary bias: {mary_bias.item():.4f}")
print(f"Prob ratio bias: {torch.exp(john_bias - mary_bias).item():.4f}x")


John bias: 2.8995
Mary bias: 1.6034
Prob ratio bias: 3.6550x
