In [1]:
import sys 
# Make the parent directory importable. The membership check keeps the path
# from being appended again when this cell is re-run in the same kernel.
# NOTE(review): presumably `mutation_release` and `model_ft` live in the
# parent directory -- confirm.
if ".." not in sys.path:
    sys.path.append("..")

import torch
from mutation_release import get_mutated_peptides
from tape import TAPETokenizer, ProteinBertConfig
from model_ft import meanTAPE

---
# HPL-Allele
## 1. HLA-B*42:01

In [2]:
# Prepare the HPL-Allele model ensemble for HLA-B*42:01: four fine-tuned
# checkpoints are loaded into `models`, next to the shared `tokenizer`.
use_cuda = True
gpu_index = 7
# torch.cuda.is_available() alone does not guarantee that ordinal `gpu_index`
# exists, so also require enough visible GPUs before selecting it. Otherwise
# fall back to the CPU instead of failing when a model is moved to a
# nonexistent device.
gpu_ready = use_cuda and torch.cuda.is_available() and torch.cuda.device_count() > gpu_index
device = torch.device(f"cuda:{gpu_index}" if gpu_ready else "cpu")
model_path = "/data/lujd/neoag_model/main_task/"
## gene
model1_filename = "TAPE_ft4/B_gene/main_finetune_plm_tape_B24_LR1e-05_seq_clip_fold4_ep10_221124.pkl"
## supertype
model2_filename = "TAPE_ft4/B_supertype_4201/main_finetune_plm_tape_B24_LR1e-05_seq_clip_fold4_ep9_230221.pkl"
## sequence
model3_filename = "TAPE_ft4/B_seq_more_4201/main_finetune_plm_tape_B26_LR6e-06_seq_clip_fold4_ep13_230223.pkl"
## semantic
model4_filename = "TAPE_ft4/B_semantic_equal_4201/main_finetune_plm_tape_B26_LR1e-05_seq_clip_fold4_ep18_230309.pkl"
model_names = [model1_filename, model2_filename, model3_filename, model4_filename]

print("Model preparing")
tokenizer = TAPETokenizer(vocab='iupac')
tape_config = ProteinBertConfig.from_pretrained('bert-base')
models = []
for model_name in model_names:
    model = meanTAPE(tape_config, "2mlp").to(device)
    # NOTE: torch.load unpickles the file, so only load trusted checkpoints.
    state_dict = torch.load(model_path + model_name, map_location=device)
    # strict=True: raise if the checkpoint keys do not match the model exactly.
    model.load_state_dict(state_dict, strict=True)
    model = model.eval()  # evaluation mode for inference
    models.append(model)
    print("load one model")
print("Model preparing done")

Model preparing
load one model
load one model
load one model
load one model
Model preparing done


In [24]:
# Run the mutation search for HLA-B*42:01 with the ensemble loaded above.
given_HLA = "HLA-B*42:01"
init_peptide = "IFYYKEFL"
# Alternative starting peptides (uncomment one to use it instead):
# init_peptide = "FRYNGLIHR"
# init_peptide = "FLGKIWPSHK"
# init_peptide = "KWIFLGLGLIL"
# init_peptide = "VWAPLILAYFPVF"
output_pos, output_pep = get_mutated_peptides(
    given_HLA,
    init_peptide,
    tokenizer,
    models,
    device,
    num_mutation=13,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supplementary_file/HPL-Allele",
)

given HLA: HLA-B*42:01, given peptide: IFYYKEFL | binding porbability: 0.0026
Iteration-1, mutant_pool size: 19
source peptide: IFYYKEFL, mutated peptide: IFYYKESL | 7 F->S | binding probability: 0.9979
source peptide: IFYYKEFL, mutated peptide: IFYYKEGL | 7 F->G | binding probability: 0.9982
source peptide: IFYYKEFL, mutated peptide: IFYYKEAL | 7 F->A | binding probability: 0.9990
source peptide: IFYYKEFL, mutated peptide: IFYYKETL | 7 F->T | binding probability: 0.9995
source peptide: IFYYKEFL, mutated peptide: IFYYKEEL | 7 F->E | binding probability: 0.9998
Iteration-2, mutant_pool size: 95
source peptide: IFYYKEAL, mutated peptide: IFYTKEAL | 4 Y->T | binding probability: 1.0000
source peptide: IFYYKEAL, mutated peptide: IFYAKEAL | 4 Y->A | binding probability: 1.0000
source peptide: IFYYKEAL, mutated peptide: IFYQKEAL | 4 Y->Q | binding probability: 1.0000
source peptide: IFYYKEEL, mutated peptide: IFYVKEEL | 4 Y->V | binding probability: 1.0000
source peptide: IFYYKEEL, mutated p

## 2. HLA-A*11:02

In [12]:
# Prepare the HPL-Allele model ensemble for HLA-A*11:02: four fine-tuned
# checkpoints are loaded into `models`, next to the shared `tokenizer`.
use_cuda = True
gpu_index = 7
# torch.cuda.is_available() alone does not guarantee that ordinal `gpu_index`
# exists, so also require enough visible GPUs before selecting it. Otherwise
# fall back to the CPU instead of failing when a model is moved to a
# nonexistent device.
gpu_ready = use_cuda and torch.cuda.is_available() and torch.cuda.device_count() > gpu_index
device = torch.device(f"cuda:{gpu_index}" if gpu_ready else "cpu")
model_path = "/data/lujd/neoag_model/main_task/"
## gene
model1_filename = "TAPE_ft4/A_gene/main_finetune_plm_tape_B24_LR1e-05_seq_clip_fold4_ep17_221123.pkl"
## supertype
model2_filename = "TAPE_ft4/A_supertype_1102/main_finetune_plm_tape_B24_LR1e-05_seq_clip_fold4_ep17_230222.pkl"
## sequence
model3_filename = "TAPE_ft4/A_seq_more_1102/main_finetune_plm_tape_B26_LR1e-05_seq_clip_fold4_ep12_230304.pkl"
## semantic
model4_filename = "TAPE_ft4/A_semantic_equal_1102/main_finetune_plm_tape_B26_LR1e-05_seq_clip_fold4_ep11_230309.pkl"
model_names = [model1_filename, model2_filename, model3_filename, model4_filename]

print("Model preparing")
tokenizer = TAPETokenizer(vocab='iupac')
tape_config = ProteinBertConfig.from_pretrained('bert-base')
models = []
for model_name in model_names:
    model = meanTAPE(tape_config, "2mlp").to(device)
    # NOTE: torch.load unpickles the file, so only load trusted checkpoints.
    state_dict = torch.load(model_path + model_name, map_location=device)
    # strict=True: raise if the checkpoint keys do not match the model exactly.
    model.load_state_dict(state_dict, strict=True)
    model = model.eval()  # evaluation mode for inference
    models.append(model)
    print("load one model")
print("Model preparing done")

Model preparing
load one model
load one model
load one model
load one model
Model preparing done


In [13]:
# Run the mutation search for HLA-A*11:02 with the ensemble loaded above.
given_HLA = "HLA-A*11:02"
init_peptide = "DAVMEAKAK"
output_pos, output_pep = get_mutated_peptides(
    given_HLA,
    init_peptide,
    tokenizer,
    models,
    device,
    num_mutation=9,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supplementary_file/HPL-Allele",
)

given HLA: HLA-A*11:02, given peptide: DAVMEAKAK | binding porbability: 0.1235
Iteration-1, mutant_pool size: 19
source peptide: DAVMEAKAK, mutated peptide: DAVREAKAK | 4 M->R | binding probability: 0.9949
source peptide: DAVMEAKAK, mutated peptide: DAVAEAKAK | 4 M->A | binding probability: 0.9985
source peptide: DAVMEAKAK, mutated peptide: DAVFEAKAK | 4 M->F | binding probability: 0.9985
source peptide: DAVMEAKAK, mutated peptide: DAVPEAKAK | 4 M->P | binding probability: 0.9993
source peptide: DAVMEAKAK, mutated peptide: DAVGEAKAK | 4 M->G | binding probability: 0.9998
Iteration-2, mutant_pool size: 95
source peptide: DAVPEAKAK, mutated peptide: AAVPEAKAK | 1 D->A | binding probability: 1.0000
source peptide: DAVGEAKAK, mutated peptide: DAVGEFKAK | 6 A->F | binding probability: 1.0000
source peptide: DAVPEAKAK, mutated peptide: HAVPEAKAK | 1 D->H | binding probability: 1.0000
source peptide: DAVGEAKAK, mutated peptide: DAVGELKAK | 6 A->L | binding probability: 1.0000
source peptide: 

## 3. HLA-E*01:03

In [2]:
# Prepare the HPL-Allele models for HLA-E*01:03. Only the sequence and
# semantic checkpoints are listed for this allele, so `models` holds two.
use_cuda = True
gpu_index = 7
# torch.cuda.is_available() alone does not guarantee that ordinal `gpu_index`
# exists, so also require enough visible GPUs before selecting it. Otherwise
# fall back to the CPU instead of failing when a model is moved to a
# nonexistent device.
gpu_ready = use_cuda and torch.cuda.is_available() and torch.cuda.device_count() > gpu_index
device = torch.device(f"cuda:{gpu_index}" if gpu_ready else "cpu")
model_path = "/data/lujd/neoag_model/main_task/"
## sequence
model3_filename = "TAPE_ft4/E_seq_more_0103/main_finetune_plm_tape_B26_LR1e-05_seq_clip_fold4_ep15_230309.pkl"
## semantic
model4_filename = "TAPE_ft4/E_semantic_equal_0103/main_finetune_plm_tape_B26_LR1e-05_seq_clip_fold4_ep8_230309.pkl"
model_names = [model3_filename, model4_filename]

print("Model preparing")
tokenizer = TAPETokenizer(vocab='iupac')
tape_config = ProteinBertConfig.from_pretrained('bert-base')
models = []
for model_name in model_names:
    model = meanTAPE(tape_config, "2mlp").to(device)
    # NOTE: torch.load unpickles the file, so only load trusted checkpoints.
    state_dict = torch.load(model_path + model_name, map_location=device)
    # strict=True: raise if the checkpoint keys do not match the model exactly.
    model.load_state_dict(state_dict, strict=True)
    model = model.eval()  # evaluation mode for inference
    models.append(model)
    print("load one model")
print("Model preparing done")

Model preparing
load one model
load one model
Model preparing done


In [11]:
# Run the mutation search for HLA-E*01:03 with the models loaded above.
# NOTE(review): the saved output under this cell reports the given peptide as
# FLLPRGLAI, not the init_peptide set here, so it appears to come from an
# earlier run -- re-run the cell to refresh it.
given_HLA = "HLA-E*01:03"
init_peptide = "DLPSRLGKI"
output_pos, output_pep = get_mutated_peptides(
    given_HLA,
    init_peptide,
    tokenizer,
    models,
    device,
    num_mutation=10,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supplementary_file/HPL-Allele",
)

given HLA: HLA-E*01:03, given peptide: FLLPRGLAI | binding porbability: 0.0426
Iteration-1, mutant_pool size: 19
source peptide: FLLPRGLAI, mutated peptide: FLLPRWLAI | 6 G->W | binding probability: 0.9986
source peptide: FLLPRGLAI, mutated peptide: FLLPRLLAI | 6 G->L | binding probability: 0.9987
source peptide: FLLPRGLAI, mutated peptide: FLLPRVLAI | 6 G->V | binding probability: 0.9990
source peptide: FLLPRGLAI, mutated peptide: FLLPRQLAI | 6 G->Q | binding probability: 0.9998
source peptide: FLLPRGLAI, mutated peptide: FLLPRELAI | 6 G->E | binding probability: 0.9999
Iteration-2, mutant_pool size: 95
source peptide: FLLPRQLAI, mutated peptide: FLLPRQLLI | 8 A->L | binding probability: 1.0000
source peptide: FLLPRVLAI, mutated peptide: FLSPRVLAI | 3 L->S | binding probability: 1.0000
source peptide: FLLPRVLAI, mutated peptide: FLYPRVLAI | 3 L->Y | binding probability: 1.0000
source peptide: FLLPRVLAI, mutated peptide: FLAPRVLAI | 3 L->A | binding probability: 1.0000
source peptide: 

---
# HPL-Pan

In [3]:
# Prepare the single HPL-Pan checkpoint; `models` is still a list so the
# mutation cells below can be called the same way as in the HPL-Allele part.
use_cuda = True
gpu_index = 7
# torch.cuda.is_available() alone does not guarantee that ordinal `gpu_index`
# exists, so also require enough visible GPUs before selecting it. Otherwise
# fall back to the CPU instead of failing when a model is moved to a
# nonexistent device.
gpu_ready = use_cuda and torch.cuda.is_available() and torch.cuda.device_count() > gpu_index
device = torch.device(f"cuda:{gpu_index}" if gpu_ready else "cpu")
model_path = "/data/lujd/neoag_model/main_task/"
model1_filename = "TAPE_ft/cat_mean_2mlp/main_finetune_plm_tape_B32_LR3e-05_seq_clip_fold4_ep51_221104.pkl"
model_names = [model1_filename]

print("Model preparing")
tokenizer = TAPETokenizer(vocab='iupac')
tape_config = ProteinBertConfig.from_pretrained('bert-base')
models = []
for model_name in model_names:
    model = meanTAPE(tape_config, "2mlp").to(device)
    # NOTE: torch.load unpickles the file, so only load trusted checkpoints.
    state_dict = torch.load(model_path + model_name, map_location=device)
    # strict=True: raise if the checkpoint keys do not match the model exactly.
    model.load_state_dict(state_dict, strict=True)
    model = model.eval()  # evaluation mode for inference
    models.append(model)
    print("load one model")
print("Model preparing done")

Model preparing
load one model
Model preparing done


## Zero-shot

In [5]:
# HPL-Pan run for HLA-B*42:01.
given_HLA = "HLA-B*42:01"
init_peptide = "KWIFLGLGLIL"
# Alternative starting peptides (uncomment one to use it instead):
# init_peptide = "RPGGKKKYK"
# init_peptide = "FRYNGLIHR"
# init_peptide = "FLGKIWPSHK"
output_pos, output_pep = get_mutated_peptides(
    given_HLA,
    init_peptide,
    tokenizer,
    models,
    device,
    num_mutation=11,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supplementary_file/HPL-Pan",
)

given HLA: HLA-B*42:01, given peptide: KWIFLGLGLIL | binding porbability: 0.0002
Iteration-1, mutant_pool size: 19
source peptide: KWIFLGLGLIL, mutated peptide: KKIFLGLGLIL | 2 W->K | binding probability: 0.4001
source peptide: KWIFLGLGLIL, mutated peptide: KTIFLGLGLIL | 2 W->T | binding probability: 0.6910
source peptide: KWIFLGLGLIL, mutated peptide: KRIFLGLGLIL | 2 W->R | binding probability: 0.7586
source peptide: KWIFLGLGLIL, mutated peptide: KEIFLGLGLIL | 2 W->E | binding probability: 0.9663
source peptide: KWIFLGLGLIL, mutated peptide: KPIFLGLGLIL | 2 W->P | binding probability: 0.9938
Iteration-2, mutant_pool size: 95
source peptide: KPIFLGLGLIL, mutated peptide: KPIDLGLGLIL | 4 F->D | binding probability: 0.9999
source peptide: KPIFLGLGLIL, mutated peptide: KPIHLGLGLIL | 4 F->H | binding probability: 1.0000
source peptide: KPIFLGLGLIL, mutated peptide: KPIQLGLGLIL | 4 F->Q | binding probability: 1.0000
source peptide: KPIFLGLGLIL, mutated peptide: KPIKLGLGLIL | 4 F->K | bindin

In [6]:
# HPL-Pan run for HLA-A*11:02.
given_HLA = "HLA-A*11:02"
init_peptide = "DAVMEAKAK"
output_pos, output_pep = get_mutated_peptides(
    given_HLA,
    init_peptide,
    tokenizer,
    models,
    device,
    num_mutation=9,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supplementary_file/HPL-Pan",
)

given HLA: HLA-A*11:02, given peptide: DAVMEAKAK | binding porbability: 0.0093
Iteration-1, mutant_pool size: 19
source peptide: DAVMEAKAK, mutated peptide: MAVMEAKAK | 1 D->M | binding probability: 0.9686
source peptide: DAVMEAKAK, mutated peptide: FAVMEAKAK | 1 D->F | binding probability: 0.9746
source peptide: DAVMEAKAK, mutated peptide: TAVMEAKAK | 1 D->T | binding probability: 0.9864
source peptide: DAVMEAKAK, mutated peptide: SAVMEAKAK | 1 D->S | binding probability: 0.9963
source peptide: DAVMEAKAK, mutated peptide: GAVMEAKAK | 1 D->G | binding probability: 0.9976
Iteration-2, mutant_pool size: 95
source peptide: GAVMEAKAK, mutated peptide: GAVMEAEAK | 7 K->E | binding probability: 0.9998
source peptide: GAVMEAKAK, mutated peptide: GAVMEAIAK | 7 K->I | binding probability: 0.9999
source peptide: SAVMEAKAK, mutated peptide: SAVMEAKEK | 8 A->E | binding probability: 0.9999
source peptide: TAVMEAKAK, mutated peptide: TAVMEAKEK | 8 A->E | binding probability: 0.9999
source peptide: 

In [7]:
# HPL-Pan run for HLA-E*01:03.
given_HLA = "HLA-E*01:03"
init_peptide = "DLPSRLGKI"
output_pos, output_pep = get_mutated_peptides(
    given_HLA,
    init_peptide,
    tokenizer,
    models,
    device,
    num_mutation=9,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supplementary_file/HPL-Pan",
)

given HLA: HLA-E*01:03, given peptide: DLPSRLGKI | binding porbability: 0.0000
Iteration-1, mutant_pool size: 19
source peptide: DLPSRLGKI, mutated peptide: DLPSRLHKI | 7 G->H | binding probability: 0.8489
source peptide: DLPSRLGKI, mutated peptide: DLPSRLNKI | 7 G->N | binding probability: 0.8597
source peptide: DLPSRLGKI, mutated peptide: DLPSRLVKI | 7 G->V | binding probability: 0.8701
source peptide: DLPSRLGKI, mutated peptide: DLPSRLIKI | 7 G->I | binding probability: 0.8945
source peptide: DLPSRLGKI, mutated peptide: DLPSRLLKI | 7 G->L | binding probability: 0.9271
Iteration-2, mutant_pool size: 95
source peptide: DLPSRLVKI, mutated peptide: DHPSRLVKI | 2 L->H | binding probability: 0.9994
source peptide: DLPSRLVKI, mutated peptide: DRPSRLVKI | 2 L->R | binding probability: 0.9994
source peptide: DLPSRLNKI, mutated peptide: SLPSRLNKI | 1 D->S | binding probability: 0.9995
source peptide: DLPSRLVKI, mutated peptide: DQPSRLVKI | 2 L->Q | binding probability: 0.9997
source peptide: 

## Common

In [19]:
# HPL-Pan run for HLA-A*02:01.
given_HLA = "HLA-A*02:01"
init_peptide = "DQTYNEMGD"
# Alternative starting peptides (uncomment one to use it instead):
# init_peptide = "MRDGGSATV"
# init_peptide = "EAARAALQG"
output_pos, output_pep = get_mutated_peptides(
    given_HLA,
    init_peptide,
    tokenizer,
    models,
    device,
    num_mutation=9,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supplementary_file/HPL-Pan",
)

given HLA: HLA-A*02:01, given peptide: DQTYNEMGD | binding porbability: 0.0000
Iteration-1, mutant_pool size: 19
source peptide: DQTYNEMGD, mutated peptide: DQTYNEMGE | 9 D->E | binding probability: 0.0012
source peptide: DQTYNEMGD, mutated peptide: DQTYNEMGI | 9 D->I | binding probability: 0.0013
source peptide: DQTYNEMGD, mutated peptide: DQTYNEMGW | 9 D->W | binding probability: 0.0039
source peptide: DQTYNEMGD, mutated peptide: DQTYNEMGV | 9 D->V | binding probability: 0.0220
source peptide: DQTYNEMGD, mutated peptide: DQTYNEMGM | 9 D->M | binding probability: 0.0407
Iteration-2, mutant_pool size: 95
source peptide: DQTYNEMGV, mutated peptide: DQTYNMMGV | 6 E->M | binding probability: 0.8842
source peptide: DQTYNEMGM, mutated peptide: DQTYNFMGM | 6 E->F | binding probability: 0.9209
source peptide: DQTYNEMGV, mutated peptide: DQTYNLMGV | 6 E->L | binding probability: 0.9546
source peptide: DQTYNEMGE, mutated peptide: DQTKNEMGE | 4 Y->K | binding probability: 0.9572
source peptide: 

In [20]:
# HPL-Pan run for HLA-A*02:02.
given_HLA = "HLA-A*02:02"
init_peptide = "ALPPRAYAM"
output_pos, output_pep = get_mutated_peptides(
    given_HLA,
    init_peptide,
    tokenizer,
    models,
    device,
    num_mutation=9,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supplementary_file/HPL-Pan",
)

given HLA: HLA-A*02:02, given peptide: ALPPRAYAM | binding porbability: 0.0009
Iteration-1, mutant_pool size: 19
source peptide: ALPPRAYAM, mutated peptide: ALPPRANAM | 7 Y->N | binding probability: 0.8687
source peptide: ALPPRAYAM, mutated peptide: ALPPRADAM | 7 Y->D | binding probability: 0.8749
source peptide: ALPPRAYAM, mutated peptide: ALPPRAVAM | 7 Y->V | binding probability: 0.8986
source peptide: ALPPRAYAM, mutated peptide: ALPPRAIAM | 7 Y->I | binding probability: 0.9219
source peptide: ALPPRAYAM, mutated peptide: ALPPRAEAM | 7 Y->E | binding probability: 0.9328
Iteration-2, mutant_pool size: 95
source peptide: ALPPRAIAM, mutated peptide: ALPPPAIAM | 5 R->P | binding probability: 0.9987
source peptide: ALPPRAVAM, mutated peptide: ALFPRAVAM | 3 P->F | binding probability: 0.9987
source peptide: ALPPRAIAM, mutated peptide: ALPPLAIAM | 5 R->L | binding probability: 0.9988
source peptide: ALPPRAVAM, mutated peptide: ALYPRAVAM | 3 P->Y | binding probability: 0.9991
source peptide: 

In [34]:
# HPL-Pan run for HLA-B*40:01.
given_HLA = "HLA-B*40:01"
init_peptide = "NSSKVSQNY"
# Alternative starting peptides (uncomment one to use it instead):
# init_peptide = "KIRLRPGGK"
# init_peptide = "ADYSVLYNST"
output_pos, output_pep = get_mutated_peptides(
    given_HLA,
    init_peptide,
    tokenizer,
    models,
    device,
    num_mutation=14,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supplementary_file/HPL-Pan",
)

given HLA: HLA-B*40:01, given peptide: NSSKVSQNY | binding porbability: 0.0001
Iteration-1, mutant_pool size: 19
source peptide: NSSKVSQNY, mutated peptide: NSSKVYQNY | 6 S->Y | binding probability: 0.0001
source peptide: NSSKVSQNY, mutated peptide: NSSKVMQNY | 6 S->M | binding probability: 0.0002
source peptide: NSSKVSQNY, mutated peptide: NSSKVFQNY | 6 S->F | binding probability: 0.0002
source peptide: NSSKVSQNY, mutated peptide: NSSKVAQNY | 6 S->A | binding probability: 0.0004
source peptide: NSSKVSQNY, mutated peptide: NSSKVGQNY | 6 S->G | binding probability: 0.0005
Iteration-2, mutant_pool size: 95
source peptide: NSSKVMQNY, mutated peptide: NSSKVMQHY | 8 N->H | binding probability: 0.1402
source peptide: NSSKVGQNY, mutated peptide: NSSKVGQQY | 8 N->Q | binding probability: 0.1490
source peptide: NSSKVMQNY, mutated peptide: NSSKVMQYY | 8 N->Y | binding probability: 0.2154
source peptide: NSSKVMQNY, mutated peptide: NSSKVMQFY | 8 N->F | binding probability: 0.2365
source peptide: 

In [41]:
# HPL-Pan run for HLA-C*01:02.
given_HLA = "HLA-C*01:02"
# The first candidate used to be assigned and then immediately overwritten
# (a dead store). It is kept as a commented-out alternative, matching how the
# other cells record unused starting peptides.
# init_peptide = "VMAYDRYLA"
init_peptide = "SLTAADIGR"
output_pos, output_pep = get_mutated_peptides(
    given_HLA,
    init_peptide,
    tokenizer,
    models,
    device,
    num_mutation=11,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supplementary_file/HPL-Pan",
)

given HLA: HLA-C*01:02, given peptide: SLTAADIGR | binding porbability: 0.0000
Iteration-1, mutant_pool size: 19
source peptide: SLTAADIGR, mutated peptide: SLMAADIGR | 3 T->M | binding probability: 0.0007
source peptide: SLTAADIGR, mutated peptide: SLGAADIGR | 3 T->G | binding probability: 0.0007
source peptide: SLTAADIGR, mutated peptide: SLPAADIGR | 3 T->P | binding probability: 0.0029
source peptide: SLTAADIGR, mutated peptide: SLEAADIGR | 3 T->E | binding probability: 0.0101
source peptide: SLTAADIGR, mutated peptide: SLDAADIGR | 3 T->D | binding probability: 0.8731
Iteration-2, mutant_pool size: 95
source peptide: SLEAADIGR, mutated peptide: SLEAADIAR | 8 G->A | binding probability: 0.9714
source peptide: SLEAADIGR, mutated peptide: SLEAADITR | 8 G->T | binding probability: 0.9816
source peptide: SLDAADIGR, mutated peptide: YLDAADIGR | 1 S->Y | binding probability: 0.9857
source peptide: SLPAADIGR, mutated peptide: SLPAADIGM | 9 R->M | binding probability: 0.9962
source peptide: 