In [1]:
# !pip install -q torch
# !pip install -q datasets
# !pip install -q sentencepiece

# !pip install -q -U bitsandbytes
# !pip install -q -U git+https://github.com/huggingface/transformers.git
# !pip install -q -U git+https://github.com/huggingface/peft.git
# !pip install -q -U git+https://github.com/huggingface/accelerate.git

# !pip install scipy
# !pip install protobuf
# !pip install scipy
# !pip install tqdm
# # !python -m pip install optimum

In [2]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, \
    BitsAndBytesConfig
from peft import PeftModel

# Local directory holding the PEFT adapter, and the hub id of the base checkpoint
# the adapter is applied to.
adapters_name = "model_outputs/dna_computing_model/"
model_name = "openlm-research/open_llama_7b"


# 4-bit NF4 quantization with double quantization; matmuls are computed in bfloat16.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# NOTE(review): torch_dtype is float16 while the 4-bit compute dtype above is
# bfloat16 -- confirm the mismatch is intentional.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},  # place the whole model on GPU 0
    quantization_config=nf4_config,
    output_attentions=True,
)
# Attach the adapter weights, then fold them into the base weights so the
# result is a plain model without PEFT wrappers.
model = PeftModel.from_pretrained(
    base_model,
    adapters_name,
)
model = model.merge_and_unload()

# The OpenLLaMA authors advise against the auto-converted fast tokenizer because
# it can produce incorrect tokenizations, so request the slow (sentencepiece)
# implementation explicitly. `trust_remote_code` is not passed: this checkpoint
# uses the standard Llama tokenizer and should not need to execute hub code.
# NOTE(review): if the adapter was trained with the fast tokenizer, verify that
# tokenization of the training data is unchanged.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
)
# Reuse EOS as the padding token and pad on the left.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
def mem_check():
    """Print the abbreviated CUDA memory summary reported by PyTorch."""
    summary = torch.cuda.memory_summary(abbreviated=True)
    print(summary)


mem_check()

In [None]:
# Read the entire corpus file into one string, then report its length in characters.
with open("DNA_computing.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()
print(len(text))

In [None]:
from tqdm import tqdm
import numpy as np
from IPython.core.display import display, HTML

def aggregate_attention(attn):
    """Reduce per-layer attention maps to a single averaged attention vector.

    For every layer the heads are averaged, the attention row of the last
    query position is taken, its first entry (the "null attention" slot) is
    replaced by zero, a trailing zero is appended for the token that is about
    to be generated, and the vector is normalized to sum to 1. The per-layer
    vectors are then averaged.

    Args:
        attn: Iterable of per-layer attention tensors.
            Assumes each is shaped (1, heads, query_len, key_len), i.e. a
            batch of one -- TODO confirm against the caller.

    Returns:
        1-D tensor of length ``key_len + 1``. A layer whose remaining
        attention mass is zero contributes an all-zero vector instead of NaNs.
    """
    avged = []
    for layer in attn:
        # Drop the batch dimension (only removed when it has size 1).
        layer_attns = layer.squeeze(0)
        attns_per_head = layer_attns.mean(dim=0)
        vec = torch.concat((
            # We zero the first entry because it's what's called
            # null attention (https://aclanthology.org/W19-4808.pdf)
            torch.tensor([0.]),
            # usually there's only one item in attns_per_head but
            # on the first generation, there's a row for each token
            # in the prompt as well, so take [-1]
            attns_per_head[-1][1:].to("cpu"),
            # add zero for the final generated token, which never
            # gets any attention
            torch.tensor([0.]),
        ))
        total = vec.sum()
        # If all attention sat on the zeroed first token, the sum is 0 and a
        # plain division would produce NaNs that poison the mean over layers.
        if total > 0:
            vec = vec / total
        avged.append(vec)
    return torch.stack(avged).mean(dim=0)
    
def precompute_attns(text):
    """Compute one aggregated attention vector for every prefix of ``text``.

    The text is tokenized once with the notebook-level ``tokenizer``. The
    notebook-level ``model`` is then run on each token prefix of length
    ``2 .. len(tokens) - 1`` (one forward pass per prefix) and the returned
    attentions are reduced with ``aggregate_attention``.

    Args:
        text: String to tokenize and analyse.

    Returns:
        List of tensors, one per prefix, ordered by increasing prefix length.
    """
    # right now I'm not worried about batch inference
    # Put the token ids on the model's device so the forward pass does not
    # depend on implicit device handling.
    inputs = tokenizer.encode(text, return_tensors="pt").to(model.device)
    result = []
    # Inference only: disable autograd so no graph is kept alive for the
    # attention tensors of every prefix.
    with torch.no_grad():
        for n in tqdm(range(2, len(inputs[0]))):
            outputs = model(
                inputs[:, :n],
                output_attentions=True,
                return_dict=True,
            )
            result.append(aggregate_attention(outputs.attentions))
    return result

# attns = precompute_attns("DNA computing is an emerging branch of unconventional computing which uses DNA, biochemistry, and molecular")
# attns

In [None]:
import gc
# Free memory between experiments. The order matters: collect unreachable
# Python objects first so any tensors they hold are dropped, then ask PyTorch
# to release its cached, unused CUDA memory, then collect once more.
gc.collect()
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Tokenize the full corpus once as PyTorch tensors; later cells slice it as a
# 2-D tensor (inputs[:, :k]).
inputs = tokenizer.encode(text, return_tensors="pt")

In [None]:
# import time
# start = time.time()
# temp_outputs = model(
#     inputs[:,:2048],
#     # output_attentions=True,
#     output_hidden_states=False,
#     past_key_values=None,
#     # use_cache=False,
# )
# end = time.time()
# print(end - start)
# temp_outputs.__dict__.keys()

In [None]:
# Bare expression: the notebook shows the model object's repr as the cell output.
model

In [None]:
# Manually step through the embedding and the first decoder layer to inspect
# that layer's attention output.
# The submodules are called directly rather than through the model's top-level
# forward, so the token ids are moved to the model's device explicitly.
# 2048 is presumably the model's context length -- TODO confirm.
temp_inputs = inputs[:, :2048].to(model.device)
with torch.no_grad():  # inspection only; no gradients needed
    token_embeds = model.model.embed_tokens(temp_inputs)
    fee = model.model.layers[0](token_embeds, output_attentions=True)

# NOTE(review): no attention mask or position ids are passed to the layer
# here -- confirm that is intended for this experiment.
# TODO: aggregate attention while going through the forward pass