In [None]:
import re
import gc
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from model import *
import matplotlib.pyplot as plt
import torch.nn.functional as F
from torch.utils.data import DataLoader
from dataset import DisulfidEmbeddingDataset
from transformers import T5EncoderModel, T5Tokenizer


# Device string used by the cells below when moving models and tensors.
DEVICE = "cuda:0"
# Identifier handed to `from_pretrained` for the ProtT5 tokenizer and encoder.
PROTT5_PATH = "Rostlab/prot_t5_xl_uniref50"
# Checkpoint file whose state dict is loaded into the disulfide model.
MODEL_PATH = "./lightning_logs/all_res_model_with_noise_ft_50/all_res_model_with_noise_ft.pt"

# Hyper-parameters gathered in one mapping; later cells read individual keys
# from it when building the T5 config and the model.
CONFIG = {
    # --- computation resources ---
    "gpu_id": [0],
    "num_thread": 16,
    "loader_num_workers": 16,
    "loader_prefetch_factor": 4,
    # --- training ---
    "lr": 5e-5,
    "l2_lambda": 0.1,
    "batch_size": 1,  # 6 was the value noted next to this entry
    "random_seed": 0,
    "min_epochs": 1,
    "max_epochs": 10,
    "gradient_clip": 1.0,
    "lr_scheduler": "LinearWarmup",
    # --- model architecture ---
    "d_model": 128,
    "d_ff": 256,
    "num_layers": 2,
    "num_heads": 4,
    "dropout_rate": 0.1,
    "is_decoder": False,
    "use_cache": False,
    "is_encoder_decoder": False,
    "feed_forward_proj": "gated-gelu",
}

In [123]:
# Build the T5 configuration from CONFIG. The three boolean flags are read
# from CONFIG (where they are all False) instead of being repeated as literals.
# NOTE(review): CONFIG["feed_forward_proj"] is deliberately NOT passed, exactly
# as in the original cell; passing it could change the layer layout and break
# load_state_dict -- confirm against how the checkpoint was trained.
config = T5Config(
    d_ff=CONFIG["d_ff"],
    d_model=CONFIG["d_model"],
    num_heads=CONFIG["num_heads"],
    num_layers=CONFIG["num_layers"],
    dropout_rate=CONFIG["dropout_rate"],
    is_decoder=CONFIG["is_decoder"],
    use_cache=CONFIG["use_cache"],
    is_encoder_decoder=CONFIG["is_encoder_decoder"],
)
model = DisulfidModel(
    config,
    learning_rate=CONFIG["lr"],
    epochs=CONFIG["max_epochs"],
    l2_lambda=CONFIG["l2_lambda"],
    lr_scheduler=CONFIG["lr_scheduler"],
    steps_per_epoch=0,
)
# Load the checkpoint onto CPU first: without `map_location`, torch.load tries
# to restore every tensor on the device it was saved from, which fails when
# that device index does not exist on the current machine.
# SECURITY: torch.load unpickles the file -- only load trusted checkpoints.
state_dict = torch.load(MODEL_PATH, map_location="cpu")
model.load_state_dict(state_dict)
# Switch to evaluation mode, then move the weights to the target device.
model = model.eval().to(DEVICE)

In [6]:
# Tokenizer for the ProtT5 checkpoint; input case is kept as-is.
tokenizer = T5Tokenizer.from_pretrained(
    PROTT5_PATH,
    do_lower_case=False,
)
# Encoder used to produce embeddings: load, move to DEVICE and put it in
# evaluation mode in a single chained expression.
emb_model = T5EncoderModel.from_pretrained(PROTT5_PATH).to(DEVICE).eval()
# Last expression of the cell, so the notebook displays its return value.
gc.collect()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


36

In [None]:
def get_emb(seq):
    """Embed one sequence string with the module-level ProtT5 encoder.

    The characters of ``seq`` are separated by single spaces, the letters
    U, Z, O and B are replaced by ``X``, and the result is tokenized and run
    through ``emb_model`` on ``DEVICE`` without gradient tracking.

    Args:
        seq: Sequence given as a string, one character per residue.

    Returns:
        A 2-D NumPy array taken from ``last_hidden_state`` of the single
        batch item, keeping all attended positions except the last one
        (presumably the end-of-sequence token added by
        ``add_special_tokens=True`` -- TODO confirm).
    """
    # Space-separate the residues, then map the listed letters to X.
    spaced = re.sub(r"[UZOB]", "X", " ".join(seq))
    encoded = tokenizer.batch_encode_plus(
        [spaced], add_special_tokens=True, padding=True
    )
    input_ids = torch.tensor(encoded["input_ids"]).to(DEVICE)
    attention_mask = torch.tensor(encoded["attention_mask"]).to(DEVICE)

    with torch.no_grad():
        output = emb_model(input_ids=input_ids, attention_mask=attention_mask)

    # Convert the attended-token count to a plain Python int so the slice does
    # not rely on implicitly indexing with a 0-dim device tensor.
    n_attended = int((attention_mask[0] == 1).sum().item())

    # Trim on the device first so only the kept rows are copied to the host.
    return output.last_hidden_state[0, : n_attended - 1].cpu().numpy()

In [None]:
# Fixed number of positions the embedding and the mask are padded to.
# NOTE(review): 1024 is taken from the original cell; confirm it matches the
# length the model was trained with.
MAX_LEN = 1024

seq = "CNCKRFPQCPLNFLC"
seq_emb = get_emb(seq)

# A negative pad amount would silently crop the embedding, so fail loudly.
if seq_emb.shape[0] > MAX_LEN:
    raise ValueError(
        f"sequence yields {seq_emb.shape[0]} embedding rows, more than MAX_LEN={MAX_LEN}"
    )

# Zero-pad along the position axis up to MAX_LEN and add a batch dimension.
emb = F.pad(
    torch.tensor(seq_emb, dtype=torch.float32),
    (0, 0, 0, MAX_LEN - seq_emb.shape[0]),
).unsqueeze(0)

# 1 for real positions, 0 for padding; shaped (1, MAX_LEN) after unsqueeze.
mask = torch.zeros(MAX_LEN, dtype=torch.float32)
mask[:len(seq)] = 1
mask = mask.unsqueeze(0)

# Inference only: skip autograd bookkeeping and call the module itself rather
# than .forward() so that registered hooks are honoured.
with torch.no_grad():
    pred = model(emb.to(DEVICE), mask.to(DEVICE))

# Per-position probabilities for the unmasked positions, computed once.
probs = torch.sigmoid(pred[mask.bool()].view(-1))

(probs > 0.5), probs

tensor(4, device='cuda:4') 4


(tensor([ True, False,  True, False, False, False, False, False,  True, False,
         False, False, False, False,  True], device='cuda:4'),
 tensor([9.9978e-01, 2.5345e-04, 9.9985e-01, 2.2168e-04, 5.8087e-04, 2.6370e-04,
         4.2629e-04, 5.3604e-04, 9.9981e-01, 2.7754e-04, 2.7818e-04, 2.1661e-04,
         2.2153e-04, 2.8626e-04, 9.9984e-01], device='cuda:4',
        grad_fn=<SigmoidBackward0>))