In [1]:
!pip install -q torch
!pip install -q datasets
!pip install -q sentencepiece

!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git

!pip install scipy
!pip install protobuf
!pip install scipy
!pip install tqdm
# !python -m pip install optimum

[0m

In [1]:
!pip install bertviz

Collecting bertviz
  Downloading bertviz-1.4.0-py3-none-any.whl (157 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.6/157.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting boto3
  Downloading boto3-1.34.14-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Collecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting botocore<1.35.0,>=1.34.14
  Downloading botocore-1.34.14-py3-none-any.whl (11.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.9/11.9 MB[0m [31m118.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting s3transfer<0.11.0,>=0.10.0
  Downloading s3transfer-0.10.0-py3-none-any.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jmespath, botocore, s3transfer, boto3, 

In [2]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, \
    BitsAndBytesConfig
from peft import PeftModel

adapters_name = "model_outputs/dna_computing_model/"
model_name = "openlm-research/open_llama_7b"


nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
    quantization_config=nf4_config,
    output_attentions=True,
)
model = PeftModel.from_pretrained(
    base_model,
    adapters_name,
)
model = model.merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.pad_token_id = tokenizer.eos_token_id
# tokenizer.padding_side = "right"
tokenizer.padding_side = "left"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
def mem_check():
    print(torch.cuda.memory_summary(abbreviated=True))

mem_check()

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   3748 MiB |   3942 MiB |  42133 MiB |  38384 MiB |
|---------------------------------------------------------------------------|
| Active memory         |   3748 MiB |   3942 MiB |  42133 MiB |  38384 MiB |
|---------------------------------------------------------------------------|
| Requested memory      |   3726 MiB |   3919 MiB |  42074 MiB |  38347 MiB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |   4118 MiB |   4118 MiB |   4204 MiB |  88064 KiB |
|---------------------------------------------------------------

In [4]:
with open("DNA_computing.txt", "r", encoding="utf-8") as f:
    text = f.read()
print(len(text))

16608


In [5]:
from tqdm import tqdm
import numpy as np
from IPython.core.display import display, HTML

def aggregate_attention(attn):
    '''Extract average attention vector'''
    avged = []
    for layer in attn:
        layer_attns = layer.squeeze(0)
        attns_per_head = layer_attns.mean(dim=0)
        vec = torch.concat((
            # We zero the first entry because it's what's called
            # null attention (https://aclanthology.org/W19-4808.pdf)
            torch.tensor([0.]),
            # usually there's only one item in attns_per_head but
            # on the first generation, there's a row for each token
            # in the prompt as well, so take [-1]
            attns_per_head[-1][1:].to("cpu"),
            # add zero for the final generated token, which never
            # gets any attention
            torch.tensor([0.]),
        ))
        avged.append(vec / vec.sum())
    return torch.stack(avged).mean(dim=0)
    
def precompute_attns(text):
    # right now I'm not worried about batch inference
    inputs = tokenizer.encode(text, return_tensors="pt")
    result = []
    for n in tqdm(range(2, len(inputs[0]))):
        outputs = model(
            inputs[:,:n],
            output_attentions=True,
            return_dict=True,
        )
        result.append(aggregate_attention(outputs.attentions))
    return result

# attns = precompute_attns("DNA computing is an emerging branch of unconventional computing which uses DNA, biochemistry, and molecular")
# attns

  from IPython.core.display import display, HTML


In [6]:
mem_check()

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   3748 MiB |   3942 MiB |  42133 MiB |  38384 MiB |
|---------------------------------------------------------------------------|
| Active memory         |   3748 MiB |   3942 MiB |  42133 MiB |  38384 MiB |
|---------------------------------------------------------------------------|
| Requested memory      |   3726 MiB |   3919 MiB |  42074 MiB |  38347 MiB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |   4118 MiB |   4118 MiB |   4204 MiB |  88064 KiB |
|---------------------------------------------------------------

In [7]:
tokenizer.pad_token_id = 0
inputs = tokenizer.encode(text, return_tensors="pt")

Token indices sequence length is longer than the specified maximum sequence length for this model (3606 > 2048). Running this sequence through the model will result in indexing errors


In [42]:
import gc
gc.collect()
torch.cuda.empty_cache()
gc.collect()

0

In [43]:
import time
start = time.time()
temp_outputs = model(
    inputs[:,:2048],
    # output_attentions=True,
    output_hidden_states=False,
    past_key_values=None,
    # use_cache=False,
)
end = time.time()
print(end - start)
temp_outputs.__dict__.keys()

OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB (GPU 0; 23.65 GiB total capacity; 22.22 GiB already allocated; 143.19 MiB free; 22.99 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [61]:
model.model.layers

ModuleList(
  (0-31): 32 x LlamaDecoderLayer(
    (self_attn): LlamaAttention(
      (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
      (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
      (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
      (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
      (rotary_emb): LlamaRotaryEmbedding()
    )
    (mlp): LlamaMLP(
      (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
      (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
      (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
      (act_fn): SiLU()
    )
    (input_layernorm): LlamaRMSNorm()
    (post_attention_layernorm): LlamaRMSNorm()
  )
)

In [63]:
temp_inputs = inputs[:,:20]
token_embeds = model.model.embed_tokens(temp_inputs)
fee = model.model.layers[0](token_embeds)
fee.shape

(tensor([[[ 0.0026,  0.0007,  0.0051,  ..., -0.0124, -0.0127, -0.0171],
          [-0.0155,  0.0312,  0.0454,  ...,  0.0222,  0.0637, -0.0302],
          [-0.0121,  0.0517,  0.0172,  ...,  0.0102,  0.0107, -0.0071],
          ...,
          [-0.0025, -0.0113,  0.0205,  ...,  0.0177,  0.0241,  0.0006],
          [-0.0021,  0.0199, -0.0120,  ..., -0.0064,  0.0023,  0.0169],
          [-0.0121,  0.0444,  0.0164,  ...,  0.0384,  0.0787, -0.0613]]],
        device='cuda:0', dtype=torch.float16),)

In [71]:
fii = model.model.layers[1](fee[0], output_attentions=True)
len(fii), fii[0].shape

(2, torch.Size([1, 20, 4096]))

In [72]:
fii

(tensor([[[ 0.0259,  0.0146, -0.0294,  ..., -0.0293, -0.0601, -0.0659],
          [-0.0770, -0.0317,  0.0940,  ...,  0.0966,  0.0543, -0.0594],
          [-0.0565,  0.0544, -0.0250,  ..., -0.0226, -0.0362, -0.0226],
          ...,
          [ 0.0070, -0.0159, -0.0090,  ...,  0.0292,  0.0454,  0.0007],
          [ 0.0086,  0.0282, -0.0274,  ..., -0.0065,  0.0262,  0.0157],
          [-0.0919,  0.0856,  0.0387,  ...,  0.0266,  0.1255, -0.0764]]],
        device='cuda:0', dtype=torch.float16),
 tensor([[[[4.4385e-01, 9.7534e-02, 3.6804e-02,  ..., 2.0203e-02,
            2.5940e-02, 1.2230e-02],
           [5.4626e-02, 4.2999e-02, 4.9255e-02,  ..., 8.6731e-02,
            6.8115e-02, 2.3834e-02],
           [4.1809e-02, 4.0588e-02, 4.9622e-02,  ..., 1.0004e-01,
            7.3425e-02, 2.4399e-02],
           ...,
           [1.2610e-01, 1.5839e-02, 1.9455e-02,  ..., 8.8379e-02,
            1.0052e-01, 5.3070e-02],
           [6.3232e-02, 1.2550e-02, 1.5747e-02,  ..., 9.2041e-02,
          

In [59]:
foo.shape

torch.Size([1, 20, 4096])

In [19]:
temp_outputs.logits.shape

torch.Size([1, 2048, 32000])

In [20]:
temp_outputs.past_key_values

((tensor([[[[-1.7676e-01, -6.9141e-01, -4.2578e-01,  ..., -4.3555e-01,
             -2.4902e-01,  1.5430e-01],
            [-6.7871e-02, -4.1748e-01, -1.1016e+00,  ...,  6.2891e-01,
             -9.2224e-02,  5.3516e-01],
            [-1.4805e+00,  3.4717e-01, -1.1104e+00,  ...,  1.8738e-02,
             -2.8857e-01, -4.6692e-02],
            ...,
            [-4.8462e-01, -7.1582e-01, -7.5439e-01,  ...,  5.1025e-01,
              1.3745e-01, -4.6997e-01],
            [-5.4688e-02, -7.9395e-01, -1.0801e+00,  ...,  3.4326e-01,
             -1.1987e-01,  5.4736e-01],
            [ 4.8291e-01, -2.0605e-01,  7.0264e-01,  ..., -1.4648e-01,
              2.1758e+00,  1.1455e+00]],
  
           [[ 8.2520e-02,  2.6953e-01,  2.1973e-01,  ..., -1.2891e+00,
              6.0156e-01,  2.1387e-01],
            [ 4.2139e-01,  1.2646e-01,  1.0850e+00,  ...,  2.1594e-01,
             -9.8047e-01,  7.3047e-01],
            [-1.3008e+00, -1.3359e+00,  9.8389e-02,  ...,  9.6924e-01,
             -8.2861

In [21]:
temp_outputs.past_key_values[0][1].shape, temp_outputs.past_key_values[0][0].shape

(torch.Size([1, 32, 2048, 128]), torch.Size([1, 32, 2048, 128]))

In [24]:
len(temp_outputs.attentions), temp_outputs.hidden_states

(32, None)

In [15]:
# cuda_model = model.to("cuda")
# model = torch.compile(model)

In [16]:
model.model.layers

ModuleList(
  (0-31): 32 x LlamaDecoderLayer(
    (self_attn): LlamaAttention(
      (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
      (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
      (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
      (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
      (rotary_emb): LlamaRotaryEmbedding()
    )
    (mlp): LlamaMLP(
      (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
      (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
      (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
      (act_fn): SiLU()
    )
    (input_layernorm): LlamaRMSNorm()
    (post_attention_layernorm): LlamaRMSNorm()
  )
)

In [51]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    

In [17]:
len(temp_outputs.attentions)

32

In [18]:
temp_outputs.attentions[0].shape

torch.Size([1, 32, 2048, 2048])

torch._dynamo.optimize is called on a non function object.
If this is a callable class, please wrap the relevant code into a function and optimize the
wrapper function.

>> class CallableClass:
>>     def __init__(self):
>>         super().__init__()
>>         self.relu = torch.nn.ReLU()
>>
>>     def __call__(self, x):
>>         return self.relu(torch.sin(x))
>>
>>     def print_hello(self):
>>         print("Hello world")
>>
>> mod = CallableClass()

If you want to optimize the __call__ function and other code, wrap that up in a function

>> def wrapper_fn(x):
>>     y = mod(x)
>>     return y.sum()

and then optimize the wrapper_fn

>> opt_wrapper_fn = torch._dynamo.optimize(wrapper_fn)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

In [36]:
import optimum

# better_model = model.to_bettertransformer()
# model.reverse_bettertransformer()
start = time.time()
temp_inputs = inputs[:,:2048].to("cuda")
# with torch.no_grad():
temp_outputs = model(
    temp_inputs,
    # output_attentions=True,
)
end = time.time()
print(end - start)
temp_outputs.__dict__.keys()

RuntimeError: 

torch._dynamo.optimize is called on a non function object.
If this is a callable class, please wrap the relevant code into a function and optimize the
wrapper function.

>> class CallableClass:
>>     def __init__(self):
>>         super().__init__()
>>         self.relu = torch.nn.ReLU()
>>
>>     def __call__(self, x):
>>         return self.relu(torch.sin(x))
>>
>>     def print_hello(self):
>>         print("Hello world")
>>
>> mod = CallableClass()

If you want to optimize the __call__ function and other code, wrap that up in a function

>> def wrapper_fn(x):
>>     y = mod(x)
>>     return y.sum()

and then optimize the wrapper_fn

>> opt_wrapper_fn = torch._dynamo.optimize(wrapper_fn)


In [None]:
# torch.__version__

In [None]:
# from transformers import __version__ as tv
# tv

In [21]:
temp_outputs.loss.keys()

dict_keys(['logits', 'past_key_values', 'attentions'])

In [24]:
start = time.time()
temp_outputs = model(
    inputs[:,:2048],
    output_attentions=False,
    # output_scores=False
)
end = time.time()
print(end - start)
temp_outputs.__dict__.keys()

RuntimeError: 

torch._dynamo.optimize is called on a non function object.
If this is a callable class, please wrap the relevant code into a function and optimize the
wrapper function.

>> class CallableClass:
>>     def __init__(self):
>>         super().__init__()
>>         self.relu = torch.nn.ReLU()
>>
>>     def __call__(self, x):
>>         return self.relu(torch.sin(x))
>>
>>     def print_hello(self):
>>         print("Hello world")
>>
>> mod = CallableClass()

If you want to optimize the __call__ function and other code, wrap that up in a function

>> def wrapper_fn(x):
>>     y = mod(x)
>>     return y.sum()

and then optimize the wrapper_fn

>> opt_wrapper_fn = torch._dynamo.optimize(wrapper_fn)


In [23]:
model

OptimizedModule(
  (_orig_mod): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(32000, 4096, padding_idx=0)
      (layers): ModuleList(
        (0-31): 32 x LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
            (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
            (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
            (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
            (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
            (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LlamaRMSNorm()
 

In [211]:
from tqdm import tqdm
import numpy as np
from IPython.display import display, HTML

def heterogenous_stack(vecs):
    '''Pad vectors with zeros then stack'''
    max_length = max(v.shape[0] for v in vecs)
    return torch.stack([
        torch.concat((v, torch.zeros(max_length - v.shape[0])))
        for v in vecs
    ])

class AttentionVisualizer:

    def __init__(self, model, tokenizer, drop_low_info_tokens=True):
        self.low_info_tokens = [tokenizer.encode(token)[1]
            for token in [
                ",",
                "the",
                "of",
                # "'",
            ]
        ]
        # 31876 is the token for an apostrophe '
        # If I pass in "'" to the encode function, it will be encoded as
        # 910, which is different from what I get if there are characters preceding
        # it
        self.low_info_tokens.append(31876)
        self.model = model
        self.tokenizer = tokenizer
        self.drop_low_info_tokens = drop_low_info_tokens

    def fit(self, text):
        """
        Don't have to worry about fine-tuning the model on text yet
        """
        self.raw_text = text
        self.tokens_encoded = self.tokenizer.encode(text, return_tensors="pt")#.to("mps")
        self.tokens_decoded = [self.tokenizer.decode(token) for token in self.tokens_encoded[0]]
        self._precompute_attns()

    def filtered_aggregate_attention(self, inputs, attn, index):
        # TODO flag for low info tokens
        # print(inputs[0].shape)
        #print(inputs)
        # n_tokens = len(tokens[1:])
        # n_tokens = len(inputs[0][1:])
        n_tokens = len(inputs[0][1:])
        has_info = np.ones(n_tokens)
        if self.drop_low_info_tokens:
            # has_info = (inputs[0][1:] != torch.tensor(self.low_info_tokens).reshape(-1, 1)).all(0)
            has_info = (inputs[0][1:] != torch.tensor(self.low_info_tokens).reshape(-1, 1)).all(0)
        # print(n_tokens, has_info, has_info.shape, inputs.shape)
        avged = []
        for layer in attn:
            # print("layer.shape:", layer.shape)
            layer_attns = layer[index-1].squeeze(0)
            # print("layer_attns.shape:", layer_attns.shape)
            # layer_attns = layer.squeeze(0)
            attns_per_head = layer_attns.mean(dim=0)
            # print("attns_per_head.shape:", attns_per_head.shape)
            # print("has_info.shape:", has_info.shape)
            # assert attns_per_head[-1][1:].shape[0] == has_info.shape[0], (attns_per_head[-1][1:].shape, has_info.shape)
            # print((attns_per_head[-1][1:].to("cpu") * has_info).shape)
            # print(attns_per_head[-1][1:].shape) # 19, 20
            # foo = attns_per_head[-1][1:].to("cpu") * has_info
            vec = torch.concat((
                # We zero the first entry because it's what's called
                # null attention (https://aclanthology.org/W19-4808.pdf)
                torch.tensor([0.]),
                # usually there's only one item in attns_per_head but
                # on the first generation, there's a row for each token
                # in the prompt as well, so take [-1]
                attns_per_head[-1] * has_info,
                # generated token gets 0 weight
                torch.tensor([0.]),
            ))
            avged.append(vec / vec.sum())
        return torch.stack(avged).mean(dim=0)

    def _precompute_attns(self):
        n = len(self.tokens_encoded[0])
        batched_ids = []
        attention_mask = []
        batched_ids = [
            list(self.tokens_encoded[0,:m]) + [tokenizer.pad_token_id] * (n - m)
            for m in range(1, n+1)
        ]
        attention_mask = [
            [1] * m + [0] * (n - m)
            for m in range(1, n+1)
        ]
        outputs = model(
            torch.tensor(batched_ids),
            attention_mask=torch.tensor(attention_mask),
            output_attentions=True,
            return_dict=True,
        )
        # TODO how to use filtered_aggregate_attention???
        self.precomputed_attentions = [
            self.filtered_aggregate_attention(
                self.tokens_encoded,
                outputs.attentions,
                m,
                # outputs.attentions[:,:m,:,:m,:m]
            )
            for m in range(2, n+1)
        ]
        return None

    def generate_attentions_for_selection(self, selected_start, n_selected=1):
        assert n_selected == 1, "Only one selection supported for now! (TODO)"
        # inputs = self.tokens_encoded[:, 0:(selected_start)]
        # print(self.tokens_decoded)
        # inputs = self.tokens_decoded[:, 0:selected_start]
        inputs = self.tokens_decoded[0:selected_start]
        print("generate_attns inputs.shape:", len(inputs))
        attn_m = heterogenous_stack(
            [
                torch.tensor([
                    1 if i == j else 0
                    for j, token in enumerate(inputs)
                ])
                for i, token in enumerate(inputs)
            ] +
            # list(map(aggregate_attention, outputs.attentions))
            # [self.filtered_aggregate_attention(inputs, outputs.attentions[0])]
            # [self.precomputed_attentions[selected_start+1]]
            [self.precomputed_attentions[selected_start-1]]
        )
        return attn_m

    def viz(self, selected_start, n_selected=1):
        """
        Visualize attention for a given selection
        """
        assert n_selected == 1, "Only one selection supported for now! (TODO)"
        attn_m = self.generate_attentions_for_selection(selected_start, n_selected)

        # Create a vector based on selected tokens
        # selected_vec = np.zeros(len(self.tokens_encoded[0]))
        print(attn_m.shape)
        selected_vec = np.zeros(attn_m.shape[0])
        selected_vec[selected_start:(selected_start + n_selected)] = 1

        # Calculate attention vector
        # attn_vec = np.dot(vec, attn_matrix.T)
        attn_vec = np.matmul(selected_vec, attn_m)
        min_val, max_val = min(attn_vec), max(attn_vec)
        attn_vec = (attn_vec - min_val) / (max_val - min_val)
        # Generate the HTML code for each token
        spans = []

        print(selected_vec.shape, attn_m.shape, attn_vec.shape, len(self.tokens_decoded))
        for i in range(1, len(self.tokens_decoded)):
            token = self.tokens_decoded[i]
            # if token == "<s>":
            #     # think this is a special token. Possibly the start of the prompt?
            #     continue
            attn = 0
            selected = False
            if i < attn_vec.shape[0]:
                attn = attn_vec[i]
                selected = False if i >= len(selected_vec) else selected_vec[i] == 1
            underline_style = "text-decoration: underline;" if selected else ""
            spans.append(f'<span style="background-color: rgba(255, 0, 0, {attn:.2f}); {underline_style}">{token}</span>')

        # Join the spans and display
        display(HTML(' '.join(spans)))

        # print(attn_matrix)
        print(attn_vec)                 

text_sub = "DNA computing is an emerging branch of unconventional computing which uses DNA, biochemistry, and molecular"

viz = AttentionVisualizer(model, tokenizer, drop_low_info_tokens=True)
viz.fit(text_sub)
viz.viz(13)

RuntimeError: The size of tensor a (20) must match the size of tensor b (19) at non-singleton dimension 0

In [209]:
viz.viz(16)

generate_attns inputs.shape: 16
torch.Size([17, 22])
(17,) torch.Size([17, 22]) torch.Size([22]) 20


tensor([0.0000, 1.0000, 0.0115, 0.0128, 0.0136, 0.0106, 0.0057, 0.0058, 0.0000,
        0.0031, 0.0045, 0.0089, 0.0120, 0.0187, 0.0264, 0.0462, 0.0099, 0.0252,
        0.0000, 0.0000, 0.0000, 0.0000], dtype=torch.float64)


In [204]:
viz.tokens_decoded[13]

'DNA'

In [97]:
viz.fit(text)
viz.viz(63)

 44%|████▎     | 1576/3605 [2:52:41<3:42:19,  6.57s/it] 


KeyboardInterrupt: 

In [None]:
viz.viz(100)

In [132]:
# Define PAD Token = BOS Token
# tokenizer.pad_token = tokenizer.bos_token
# model.config.pad_token_id = model.config.bos_token_id
tokenizer.pad_token = "[PAD]"
tokenizer.padding_side = "left"

a = tokenizer.decode(inputs[0,1:10])
b = tokenizer.decode(inputs[0,1:20])
# fee = heterogenous_stack([inputs[0,1:10],inputs[0,1:20]])
# c = tokenizer.encode(fee, return_tensors="pt")
c = tokenizer.batch_encode([a,b], return_tensors="pt", padding=True)
c

AttributeError: 'LlamaTokenizerFast' object has no attribute 'batch_encode'

In [139]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_sub))

def precompute(text_sub):
    tokens = tokenizer.tokenize(text_sub)
    ids = tokenizer.convert_tokens_to_ids(tokens)
    n = len(ids)
    batch_ids = []
    attn_mask = []
    for m in range(1, n+1):
        batch_ids.append(
            ids[:m] + [tokenizer.pad_token_id] * (n - m)
        )
        attn_mask.append(
            np.ones(n)
        )
        attn_mask[-1][m:] = 0
        attn_mask[-1] = np.array(attn_mask[-1])
    outputs = model(
        torch.tensor(batch_ids),
        attention_mask=torch.tensor(attn_mask),
        output_attentions=True,
        return_dict=True,
    )
    return outputs

outputs = precompute(text_sub)
len(outputs)

3

In [143]:
len(outputs.attentions[0])

19

In [144]:
outputs.attentions[0].shape

torch.Size([19, 32, 19, 19])

In [145]:
len(tokenizer.tokenize(text_sub))

19

In [158]:
outputs.attentions[0][3][31]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.3037, 0.6963, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.1184, 0.2172, 0.6646, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0729, 0.1145, 0.3923, 0.4202, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0111, 0.0364, 0.3926, 0.5601, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0201, 0.0406, 0.4421, 0.4971, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.

In [131]:
model(c, output_attentions=True, return_dict=True).attentions[0].shape

torch.Size([1, 32, 30, 30])

In [117]:
tokenizer.encode(fee)

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [99]:
c_outputs = model(
    c,
    output_attentions=True,
    return_dict=True,
)
c_outputs.attentions

(tensor([[[[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.4634, 0.5366, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.2578, 0.2788, 0.4636,  ..., 0.0000, 0.0000, 0.0000],
           ...,
           [0.1552, 0.0176, 0.0197,  ..., 0.0081, 0.0000, 0.0000],
           [0.0775, 0.0229, 0.0270,  ..., 0.0213, 0.0583, 0.0000],
           [0.0643, 0.0161, 0.0180,  ..., 0.0226, 0.0482, 0.1124]],
 
          [[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.7905, 0.2094, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.5000, 0.1364, 0.3635,  ..., 0.0000, 0.0000, 0.0000],
           ...,
           [0.1318, 0.0125, 0.0106,  ..., 0.0567, 0.0000, 0.0000],
           [0.1168, 0.0111, 0.0097,  ..., 0.0707, 0.0328, 0.0000],
           [0.1392, 0.0100, 0.0072,  ..., 0.0363, 0.0436, 0.0341]],
 
          [[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.3203, 0.6797, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.5454, 0.14

In [102]:
len(c_outputs.attentions[0][0])

32

In [105]:
c.shape

torch.Size([1, 30])

In [85]:
viz.precomputed_attentions

[]

In [66]:
len(tokenizer.encode(a, return_tensors="pt")[0])


10

In [69]:
curr = tokenizer.encode(a, return_tensors="pt")
curr[:,:5]

tensor([[    1,  7569, 12563,   322,   363]])

In [61]:
model(
    tokenizer.encode(a, return_tensors="pt"),
    output_attentions=True,
    return_dict=True,
)

CausalLMOutputWithPast(loss={'logits': tensor([[[-18.1406, -15.1016,  -5.5977,  ...,  -8.8047,  -9.9766, -10.6953],
         [-16.7969, -16.0469,  -6.7461,  ...,  -8.5625,  -8.9375, -11.0312],
         [-19.4688, -14.8438,  -1.7852,  ..., -11.6719, -11.4141, -12.6172],
         ...,
         [-18.0625, -15.2500,  -1.1396,  ...,  -7.8008,  -7.7188,  -9.3438],
         [-12.8203,  -7.7930,   3.3809,  ...,  -4.8711,  -2.0977,  -4.9414],
         [-17.7812, -13.7031,   0.0336,  ...,  -8.3359,  -8.3594,  -7.6523]]]), 'past_key_values': ((tensor([[[[-0.1768, -0.6914, -0.4258,  ..., -0.4355, -0.2490,  0.1543],
          [-0.0679, -0.4175, -1.1016,  ...,  0.6289, -0.0922,  0.5352],
          [-1.4805,  0.3472, -1.1104,  ...,  0.0187, -0.2886, -0.0467],
          ...,
          [-0.3816, -0.9072, -0.2661,  ...,  0.1405, -0.3401,  1.3340],
          [-1.8662, -0.1570,  0.3367,  ...,  0.9009, -0.2003,  0.7539],
          [-1.9121,  0.2529, -0.7671,  ...,  0.3948, -0.5767,  0.1206]],

         [[ 

In [62]:
mem_check()

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   7493 MiB |   7681 MiB | 109565 MiB | 102072 MiB |
|---------------------------------------------------------------------------|
| Active memory         |   7493 MiB |   7681 MiB | 109565 MiB | 102072 MiB |
|---------------------------------------------------------------------------|
| Requested memory      |   7450 MiB |   7638 MiB | 109412 MiB | 101962 MiB |
|---------------------------------------------------------------------------|
| GPU reserved memory   |   8156 MiB |   8156 MiB |   8328 MiB | 176128 KiB |
|---------------------------------------------------------------