In [8]:
from huggingface import HyenaDNAPreTrainedModel
from standalone_hyenadna import CharacterTokenizer
import torch


In [6]:
# Which checkpoint to load, plus the classification-head settings that are
# forwarded to the model loader.
pretrained_model_name = 'hyenadna-tiny-1k-seqlen'
use_head = False  # head switched off for this run
n_classes = 2  # forwarded together with use_head

# Prefer the GPU whenever PyTorch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Using device:", device)

Using device: cuda


In [7]:
# Load the pretrained checkpoint selected by `pretrained_model_name`.
# './checkpoints' is presumably the local directory the weights are stored
# under -- confirm against HyenaDNAPreTrainedModel.from_pretrained.
model = HyenaDNAPreTrainedModel.from_pretrained(
    './checkpoints',
    pretrained_model_name,
    download=True,  # the recorded run shows the checkpoint repo being git-cloned
    config=None,
    device=device,
    use_head=use_head,
    n_classes=n_classes,
)

Updated Git hooks.
Git LFS initialized.


Cloning into 'hyenadna-tiny-1k-seqlen'...


Loaded pretrained weights ok!


  loaded_ckpt = torch.load(


In [9]:
# Maximum sequence length associated with each checkpoint name.
max_lengths = {
    'hyenadna-tiny-1k-seqlen': 1_024,
    'hyenadna-small-32k-seqlen': 32_768,
    'hyenadna-medium-160k-seqlen': 160_000,
    'hyenadna-medium-450k-seqlen': 450_000,  # author's note: a T4 copes up to here
    'hyenadna-large-1m-seqlen': 1_000_000,  # author's note: A100 only (paid tier)
}

# Look up the length for the checkpoint chosen earlier in the notebook;
# a name missing from the table raises KeyError.
max_length = max_lengths[pretrained_model_name]

In [10]:
# Build the character-level tokenizer used for DNA sequences.
# NOTE(review): the recorded embedding shape later in this notebook has
# max_length + 2 positions, so two extra tokens appear to be added when the
# tokenizer is called despite add_special_tokens=False here -- confirm.
tokenizer = CharacterTokenizer(
    # DNA alphabet; 'N' marks an uncertain base
    characters=['A', 'C', 'G', 'T', 'N'],
    # two slots beyond the raw sequence, reserved for special tokens such as EOS
    model_max_length=max_length + 2,
    # HyenaDNA is causal, hence padding on the left
    padding_side='left',
    # per the original author's note, special tokens are handled elsewhere
    add_special_tokens=False,
)

In [11]:
#### Single embedding example ####

# Build a synthetic sample by repeating 'ACTG' until it fills max_length
# characters (1,024 for the tiny checkpoint configured above, not 450k),
# then tokenize it.
sequence = 'ACTG' * (max_length // 4)  # integer division, no float round-trip
tok_seq = tokenizer(sequence)["input_ids"]  # keep only the token ids

# Create the int64 tensor directly on the target device and add a leading
# batch dimension. torch.tensor replaces the legacy torch.LongTensor
# constructor followed by a separate .to(device) copy.
tok_seq = torch.tensor(tok_seq, dtype=torch.long, device=device).unsqueeze(0)

In [12]:
# Move the model to the selected device and switch it to evaluation mode
# before running the forward pass.
model.to(device)
model.eval()
# The forward pass runs inside torch.inference_mode(); tok_seq is the batched
# token-id tensor prepared in the previous cell.
with torch.inference_mode():
    embeddings = model(tok_seq)

print(embeddings.shape)  # the recorded run printed torch.Size([1, 1026, 128])

torch.Size([1, 1026, 128])
