In [1]:
import torch
from models.modeling_dsm import DSM # Or DSM_ppi for binder generation

# Checkpoint identifier handed to from_pretrained; replace with your model of choice
model_name_or_path = "GleghornLab/DSM_650"
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Load the pre-trained weights, move them to the chosen device and switch to eval mode
model = DSM.from_pretrained(model_name_or_path)
model = model.to(device).eval()
# The model exposes its own tokenizer; later cells use it for encoding
tokenizer = model.tokenizer

You are using a model of type esm_diff to instantiate a model of type dsm. This is not supported for all configurations of models and can yield errors.


In [2]:
### Unconditional generation
length = 100
mask_token = tokenizer.mask_token
# Optionally enforce a methionine start: one 'M' followed by (length - 1) mask tokens
masked_template = 'M' + mask_token * (length - 1)
input_tokens = tokenizer.encode(
    masked_template,
    add_special_tokens=True,
    return_tensors='pt',
).to(device)

# Note: output will be a tuple if return_trajectory is True
output = model.mask_diffusion_generate(
    tokenizer=tokenizer,
    input_tokens=input_tokens,
    # lower is slower but better
    step_divisor=100,
    # sampling temperature
    temperature=1.0,
    # strategy for remasking tokens not kept
    remasking="random",
    # set to True to watch the mask tokens get filled in real time
    preview=False,
    # adds a small delay to the real-time filling (it is usually too fast to follow)
    slow=False,
    # set to True to also return the generation trajectory (what the preview shows)
    return_trajectory=False,
)

generated_sequences = model.decode_output(output)
print(f"Generated sequence: {generated_sequences[0]}")

Generated sequence: MTTPIEVFRVGQKRKYLCVSDKYPLALNCLKLPMIRELHRQLGHRNWAGAEAEESPVTNERPFALKCINWPFIMMELRLQSRDDLHIRTSNDKLGPPVPW


In [3]:
# Mask Filling / Inpainting
# Fixed residues are written out; each <mask> marks a position for the model to fill in
template_sequence = "MA<mask><mask><mask>KEG<mask><mask>STL"
input_tokens = tokenizer.encode(
    template_sequence,
    add_special_tokens=True,
    return_tensors='pt',
).to(device)

# Note: output will be a tuple if return_trajectory is True
output = model.mask_diffusion_generate(
    tokenizer=tokenizer,
    input_tokens=input_tokens,
    # lower is slower but better
    step_divisor=100,
    # sampling temperature
    temperature=1.0,
    # strategy for remasking tokens not kept
    remasking="random",
    # set to True to watch the mask tokens get filled in real time
    preview=False,
    # adds a small delay to the real-time filling (it is usually too fast to follow)
    slow=False,
    # set to True to also return the generation trajectory (what the preview shows)
    return_trajectory=False,
)

generated_sequences = model.decode_output(output)
print(f"Generated sequence: {generated_sequences[0]}")

Generated sequence: MAAGGKEGLTSTL


In [6]:
# Binder generation: given a target protein, generate an interacting sequence.
#
# Alternative checkpoint, kept for reference only:
# from models.modeling_dsm import DSM_ppi
# model_binder = DSM_ppi.from_pretrained("GleghornLab/DSM_650_ppi_lora").to(device).eval()
# The LoRA version from the paper leads to unreliable outputs.
# Synthyra has generously trained a version through full fine-tuning; that one is loaded here.
from models.modeling_dsm import DSM

model_binder = DSM.from_pretrained("Synthyra/DSM_ppi_full").to(device).eval()
# Take the tokenizer and mask token from the binder model itself so this cell
# does not rely on `tokenizer` / `mask_token` left behind by earlier cells.
binder_tokenizer = model_binder.tokenizer
binder_mask_token = binder_tokenizer.mask_token

# BBF-14
target_seq = "MGTPLWALLGGPWRGTATYEDGTKVTLDYRYTRVSPDRLRADVTYTTPDGTTLEATVDLWKDANGVIRYHATYPDGTSADGTLTQLDADTLLATGTYDDGTKYTVTLTRVAPGSGWHHHHHH"
# For binder generation, the 'interactor' (SeqB) part is what gets generated/filled.
# Start with a fully masked interactor of the desired length.
interactor_template_len = 256
interactor_template = ''.join([binder_mask_token] * interactor_template_len)

# Model input layout: target sequence, an '<eos>' separator, then the masked interactor.
combined_input_str = target_seq + '<eos>' + interactor_template

input_tokens = binder_tokenizer.encode(combined_input_str, add_special_tokens=True, return_tensors='pt').to(device)

output = model_binder.mask_diffusion_generate(
    tokenizer=binder_tokenizer,
    input_tokens=input_tokens,
    step_divisor=10,           # lower is slower but better
    temperature=1.0,           # sampling temperature
    remasking="random",        # strategy for remasking tokens not kept
    preview=False,             # set this to True to watch the mask tokens get filled in real time
    slow=False,                # adds a small delay to the real time filling (because it is usually very fast and watching carefully is hard!)
    return_trajectory=False    # set this to True to return the trajectory of the generation (what you watch in the preview)
) # Note: output will be a tuple if return_trajectory is True

# Decode with model_binder, the model that produced `output`, rather than the
# base `model` loaded in the first cell.
# NOTE(review): the keyword `seperator` (sic) is kept exactly as written;
# presumably it matches the parameter name of decode_dual_input — confirm.
target, binder = model_binder.decode_dual_input(output, seperator='<eos>')
# Parse out the generated interactor part based on EOS tokens.
# Example: generated_full_seq_str.split(model_binder.tokenizer.eos_token)[1]
print(f"Generated binder {binder[0]}")

Generated binder HRHHHRRPTHARETEWLARMRLGIAEHQRIAVPRSDLEPDQMRERAADNQRLVKEYDQVIDHQTEGSTERLFEVLRVWEQVNTEQAHHEASAALEFGRVGYPDDEGGRAFYTQANAHKKDLVEYIGGIDEDAKWDPRIAWLMPEGGQPVKATVIGVSEERINGLKVLDDHWGRERRLWLINLFTALQAYDDPTRPTQVTLTPATDQLTNDVQYLLLSTRYTPPGVTTAVKIRKLDGRTLKVLTTEAPYVVRGATLS


In [19]:
import ast


# String form of a Python list literal, with deliberately uneven spacing.
test = '[0,1, 2, 3, 3, 1, 1, 3, 4]'

# ast.literal_eval parses Python literals only; it does not execute arbitrary code.
test_ast = ast.literal_eval(test)
# Enforce the type check: as a bare mid-cell expression its result was discarded.
assert isinstance(test_ast, list), f"expected a list, got {type(test_ast).__name__}"
# Last expression of the cell, so the parsed list is displayed as the cell output.
test_ast

[0, 1, 2, 3, 3, 1, 1, 3, 4]

In [20]:
# Log in to the Hugging Face Hub from inside the notebook.
# The recorded output of this cell is a widget (VBox) showing the Hugging Face logo;
# presumably this is the interactive prompt for an access token — confirm against
# the huggingface_hub documentation.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import argparse


# Read an optional Hugging Face token from the command line (--token).
parser = argparse.ArgumentParser()
parser.add_argument('--token', type=str, default=None)
# parse_known_args() ignores arguments this parser does not define. A Jupyter
# kernel is typically launched with extra argv entries (e.g. `-f <connection file>`),
# which make a plain parse_args() exit with "unrecognized arguments".
# When run as a script, --token is parsed exactly as before.
args, _unknown_args = parser.parse_known_args()


# NOTE(review): `dataset` is not defined in any visible cell; it must be created
# (presumably a Hugging Face `datasets` object — confirm) before this line runs.
# 'account/name' is a placeholder repository id.
dataset.push_to_hub('account/name', token=args.token)




