In [1]:
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer, EsmForMaskedLM
from tokenizers import Tokenizer
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# ProGen2 small model initialization

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

model_name = "hugohrban/progen2-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

Using cpu device


A new version of the following files was downloaded from https://huggingface.co/hugohrban/progen2-small:
- configuration_progen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/hugohrban/progen2-small:
- modeling_progen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


ProGenForCausalLM(
  (transformer): ProGenModel(
    (wte): Embedding(32, 1024)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x ProGenBlock(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): ProGenAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (qkv_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): ProGenMLP(
          (fc_in): Linear(in_features=1024, out_features=4096, bias=True)
          (fc_out): Linear(in_features=4096, out_features=1024, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=32, bias=True)
)

In [5]:
model_name = "hugohrban/progen2-small"
initialize_progen2(model_name)

(ProGenForCausalLM(
   (transformer): ProGenModel(
     (wte): Embedding(32, 1024)
     (drop): Dropout(p=0.0, inplace=False)
     (h): ModuleList(
       (0-11): 12 x ProGenBlock(
         (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
         (attn): ProGenAttention(
           (attn_dropout): Dropout(p=0.0, inplace=False)
           (resid_dropout): Dropout(p=0.0, inplace=False)
           (qkv_proj): Linear(in_features=1024, out_features=3072, bias=False)
           (out_proj): Linear(in_features=1024, out_features=1024, bias=False)
         )
         (mlp): ProGenMLP(
           (fc_in): Linear(in_features=1024, out_features=4096, bias=True)
           (fc_out): Linear(in_features=4096, out_features=1024, bias=True)
           (act): NewGELUActivation()
           (dropout): Dropout(p=0.0, inplace=False)
         )
       )
     )
     (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
   )
   (lm_head): Linear(in_features=1024, out_features=32, bias=

In [8]:
def initialize_progen2(model_name):
    # Define tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
    # Evaluate model
    model.eval()

    return model, tokenizer

def collect_log_prob_pg2(sequence, model, tokenizer, device="cpu"):

    # Define indices for log-likelihood ratio matrix
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    aa_token_ids = [tokenizer.convert_tokens_to_ids(aa) for aa in amino_acids]

    prompt = "1"+sequence

    input_ids = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(model.device)
    with torch.no_grad():
        logits = model(input_ids).logits
    
    log_probs = F.log_softmax(logits, dim = -1)

    ref_log_probs = log_probs[0, torch.arange(input_ids.size(1)), input_ids[0]]
    ref_log_probs = ref_log_probs.unsqueeze(1)
    ref_log_probs = ref_log_probs[1:]

    log_probs = log_probs[0,1:]

    llr_matrix = log_probs - ref_log_probs
    llr_matrix = llr_matrix[:, aa_token_ids]

    return log_probs, ref_log_probs, llr_matrix



In [7]:
crx_sequence = """
MMAYMNPGPHYSVNALALSGPSVDLMHQAVPYPSAPRKQRRERTTFTRSQLEELEALFAKTQYPDVYAREEVALKINLPESRVQVWFKNRRAKCRQQRQQQKQQQQPPGGQAKARPAKRKAGTSPRPSTDVCPDPLGISDSYSPPLPGPSGSPTTAVATVSIWSPASESPLPEAQRAGLVASGPSLTSAPYAMTYAPASAFCSSPSAYGSPSSYFSGLDPYLSPMVPQLGGPALSPLSGPSVGPSLAQSPTSLSGQSYGAYSPVDSLEFKDPTGTWKFTYNPMDPLDYKDQSAWKFQIL
""".replace('\n', '').replace(' ', '')
prompt_crx = "1" + crx_sequence

In [10]:
model, tokenizer = initialize_progen2(model_name)

In [11]:
collect_log_prob_pg2(crx_sequence,model,tokenizer)

(tensor([[-36.2644, -36.3007, -32.9435,  ..., -21.1581, -36.3621, -36.3414],
         [-34.1407, -34.1769, -30.7594,  ..., -18.2608, -34.2408, -34.2193],
         [-39.6851, -39.7304, -29.1338,  ..., -17.7250, -39.8043, -39.7781],
         ...,
         [-37.8829, -37.9326, -28.2506,  ..., -17.4916, -38.0190, -37.9870],
         [-37.4543, -37.5023, -29.3698,  ..., -18.1899, -37.5809, -37.5523],
         [-36.5557, -36.6059, -28.3583,  ..., -17.8775, -36.6887, -36.6580]]),
 tensor([[-4.5802e+00],
         [-4.2288e+00],
         [-2.2482e+00],
         [-3.6050e+00],
         [-3.6774e+00],
         [-2.8356e+00],
         [-2.6714e+00],
         [-2.6214e+00],
         [-2.2045e+00],
         [-2.9178e+00],
         [-2.8900e+00],
         [-2.2177e+00],
         [-3.0083e+00],
         [-2.6223e+00],
         [-2.3427e+00],
         [-2.5151e+00],
         [-2.0879e+00],
         [-2.2736e+00],
         [-2.1795e+00],
         [-2.1865e+00],
         [-2.4332e+00],
         [-1.8647e

In [13]:
def seq_matrix_dict_pg2(sequence_list, model, tokenizer,device="cpu"):

    seq_dict = dict()

    n = len(sequence_list)

    for i in range(n):
        sequence = sequence_list[i]
        lp, rlp, llr = collect_log_prob_pg2(crx_sequence,model,tokenizer)

        seq_dict[i] = {'log_probs': lp, 'ref_log_probs': rlp, 'llr_matrix': llr}

    return seq_dict