In [8]:
import os
import numpy as np
import pandas as pd
import torch
from tape import TAPETokenizer, ProteinBertConfig
from model_ft import meanTAPE

In [9]:
# Case: the HLA allele and the peptide(s) whose binding is to be predicted.
# given_HLA is used further down as the lookup key into the allele -> sequence table.
given_HLA = "HLA-B*42:01"
# Either a single peptide string or a list of peptide strings.
given_peptide = "RPGGKKKYK"     # or a list of peptides, e.g. ["RPGGKKKYK", "RTSKAALER"]

In [10]:
# Prepare model
# Builds the tokenizer, the TAPE config and one meanTAPE instance per checkpoint
# listed in `model_names`; the loaded models are collected in `models`.
use_cuda = True
# Run on the GPU only when it is both requested here and actually available.
device = torch.device("cuda" if (use_cuda and torch.cuda.is_available()) else "cpu")
model_path = "/data/lujd/neoag_model/main_task/"
model1_filename = "HPL-Pan/cat_mean_2mlp/main_finetune_plm_tape_B32_LR3e-05_seq_clip_fold4_ep51_221104.pkl"
model_names = [model1_filename]         # add more filenames for HPL-Allele

print(">>> Model preparing")
tokenizer = TAPETokenizer(vocab='iupac')
tape_config = ProteinBertConfig.from_pretrained('bert-base')
models = []
for model_name in model_names:
    weights_file = os.path.join(model_path, model_name)
    model = meanTAPE(tape_config, "2mlp").to(device)
    # NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
    # strict=True requires the checkpoint keys to match the model's parameters exactly.
    model.load_state_dict(torch.load(weights_file, map_location=device), strict=True)
    # Store the model switched to evaluation mode.
    models.append(model.eval())
    print(f"Load model from {model_name}")

print(">>> Model preparing done")

>>> Model preparing
Load model from HPL-Pan/cat_mean_2mlp/main_finetune_plm_tape_B32_LR3e-05_seq_clip_fold4_ep51_221104.pkl
>>> Model preparing done


In [11]:
# Prepare inputs
print(">>> Input preparing")

print("Load HLA allele2sequence dict")
data_path = "/data/lujd/neoag_data/"
hla_table_file = os.path.join(data_path, "main_task/HLA_sequence_dict_ABCEG.csv")
hla_table = pd.read_csv(hla_table_file, index_col=0)
# Map each allele name (column "HLA_name") to the sequence stored in column "clip".
hla_seq_dict = hla_table.set_index(["HLA_name"])["clip"].to_dict()
# A KeyError here means the requested allele is not present in the table.
HLA_seq = hla_seq_dict[given_HLA]

def seq2token(tokenizer, hla_seq, pep_seq, hla_max_len=182, pep_max_len=15):
    """Tokenize an HLA sequence together with one or more peptides.

    The HLA sequence and every peptide are right-padded with 'X' up to
    ``hla_max_len`` / ``pep_max_len`` before encoding. Sequences that are
    already longer than the maximum are left as they are (no truncation).

    Args:
        tokenizer: Object exposing ``encode(str)``; called once for the padded
            HLA sequence and twice per peptide (peptide alone, HLA + peptide).
        hla_seq: HLA amino-acid sequence (str).
        pep_seq: A single peptide string or an iterable of peptide strings.
            Any ``str`` instance, including subclasses such as ``numpy.str_``,
            is treated as ONE peptide rather than iterated per character.
        hla_max_len: Padding length for the HLA sequence.
        pep_max_len: Padding length for each peptide.

    Returns:
        Tuple ``(hla_token, pep_tokens, hla_pep_tokens)`` of numpy arrays:
        the encoded padded HLA sequence, one encoded row per padded peptide,
        and one encoded row per concatenated padded HLA + padded peptide.
    """
    pep_tokens, hla_pep_tokens = [], []

    hla_seq = hla_seq.ljust(hla_max_len, 'X')
    hla_token = tokenizer.encode(hla_seq)

    # isinstance (not ``type(...) == str``) so that str subclasses are wrapped
    # too; otherwise such a peptide would be split into single residues below.
    if isinstance(pep_seq, str):
        pep_seq = [pep_seq]

    for seq in pep_seq:
        seq = seq.ljust(pep_max_len, 'X')
        pep_tokens.append(tokenizer.encode(seq))

        # The paired input is the padded HLA immediately followed by the padded peptide.
        phla_seq = hla_seq + seq
        hla_pep_tokens.append(tokenizer.encode(phla_seq))

    return np.array(hla_token), np.array(pep_tokens), np.array(hla_pep_tokens)

print("Convert sequence to tokens")
# Only the third return value (HLA + peptide tokens) is needed. Index it instead
# of unpacking into `_`, because assigning `_` in a notebook overrides IPython's
# last-output variable for the rest of the session.
hla_pep_tokens = seq2token(tokenizer, HLA_seq, given_peptide)[2]
# Token ids as a LongTensor on the same device as the models.
hla_pep_inputs = torch.LongTensor(hla_pep_tokens).to(device)

print(">>> Input preparing done")

>>> Input preparing
Load HLA allele2sequence dict
Convert sequence to tokens
>>> Input preparing done


In [None]:
# Inference
# Every model scores all inputs; the per-model scores are averaged and squashed
# with a sigmoid to give one probability per peptide.
if not models:
    raise ValueError("No models loaded: `models` must contain at least one model")

per_model_scores = []
with torch.no_grad():
    for model in models:
        model.eval()
        output = model(hla_pep_inputs)
        # Score = column 1 minus column 0 of the model output
        # (presumably the binder vs. non-binder logits -- TODO confirm against meanTAPE).
        score = output[:, 1] - output[:, 0]
        per_model_scores.append(score.cpu().detach().numpy())

# Sequential sum divided by the model count, i.e. the mean score per peptide.
score_ensemble = sum(per_model_scores) / len(models)
prob = 1 / (1 + np.exp(-score_ensemble))            # sigmoid

# Accept a single peptide (any str instance) as well as a list of peptides.
if isinstance(given_peptide, str):
    given_peptide = [given_peptide]

for i, pep in enumerate(given_peptide):
    print("HLA: {}, peptide: {} | binding probability: {:.4f}".format(given_HLA, pep, prob[i].item()))

HLA: HLA-B*42:01, peptide: RPGGKKKYK | binding porbability: 0.0008
