In [1]:
import sys 
if ".." not in sys.path:
    sys.path.append("..")

import torch
from mutation_release import get_mutated_peptides
from tape import TAPETokenizer, ProteinBertConfig
from HPL.model_ft import meanTAPE

---
# HPL-Allele
## 1. HLA-B*42:01

In [2]:
# Prepare the HPL-Cluster models for HLA-B*42:01.
# This cell defines `device`, `tokenizer` and `models`, which are passed to
# get_mutated_peptides in the next code cell.
use_cuda = True
gpu_index = 5   # GPU ordinal hard-coded by the original cell
if use_cuda and torch.cuda.is_available():
    # Moving a model to a non-existent ordinal fails with "invalid device ordinal",
    # so fall back to GPU 0 when this machine exposes fewer cards.
    if gpu_index >= torch.cuda.device_count():
        gpu_index = 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")

# NOTE(review): absolute, machine-specific checkpoint root; must end with "/"
# because file names are appended by plain string concatenation below.
model_path = "/data/lujd/neoag_model/main_task/"
## gene
model1_filename = "HPL-Cluster/B_gene/main_finetune_plm_tape_B24_LR1e-05_seq_clip_fold4_ep10_221124.pkl"
## supertype
model2_filename = "HPL-Cluster/B_supertype_4201/main_finetune_plm_tape_B24_LR1e-05_seq_clip_fold4_ep9_230221.pkl"
## sequence
model3_filename = "HPL-Cluster/B_seq_more_4201/main_finetune_plm_tape_B26_LR6e-06_seq_clip_fold4_ep13_230223.pkl"
## semantic
model4_filename = "HPL-Cluster/B_semantic_equal_4201/main_finetune_plm_tape_B26_LR1e-05_seq_clip_fold4_ep18_230309.pkl"
model_names = [model1_filename, model2_filename, model3_filename, model4_filename]

print("Model preparing")
tokenizer = TAPETokenizer(vocab='iupac')
tape_config = ProteinBertConfig.from_pretrained('bert-base')
models = []
for model_name in model_names:
    model = meanTAPE(tape_config, "2mlp").to(device)
    # SECURITY: torch.load unpickles the file; only load checkpoints from a trusted source.
    state_dict = torch.load(model_path + model_name, map_location=device)
    # strict=True raises if the checkpoint keys do not exactly match the model.
    model.load_state_dict(state_dict, strict=True)
    model = model.eval()
    models.append(model)
    print("load one model")
print("Model preparing done")

Model preparing
load one model
load one model
load one model
load one model
Model preparing done


In [5]:
# Call get_mutated_peptides for HLA-B*42:01 with the models loaded in the previous cell.
# Other peptides noted in the original cell (not used in this run):
#   PLRPMTYR, IFYYKEFL, FRYNGLIHR, KYRLKHIVW, FLGKIWPSHK, KWIFLGLGLIL, VWAPLILAYFPVF
given_HLA = "HLA-B*42:01"
init_peptide = "RTSKAALER"

# Keyword settings forwarded unchanged to get_mutated_peptides.
# NOTE(review): their exact meaning is defined in mutation_release — confirm there.
search_kwargs = dict(
    num_mutation=9,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supp_case_file/HPL-Allele",
)
output_pos, output_pep = get_mutated_peptides(
    given_HLA, init_peptide, tokenizer, models, device, **search_kwargs
)

given HLA: HLA-B*42:01, given peptide: RTSKAALER | binding porbability: 0.0000
Iteration-1, mutant_pool size: 19
source peptide: RTSKAALER, mutated peptide: RTSKAALEM | 9 R->M | binding probability: 0.0002
source peptide: RTSKAALER, mutated peptide: RTSKAALEF | 9 R->F | binding probability: 0.0005
source peptide: RTSKAALER, mutated peptide: RTSKAALEI | 9 R->I | binding probability: 0.0008
source peptide: RTSKAALER, mutated peptide: RTSKAALEV | 9 R->V | binding probability: 0.0015
source peptide: RTSKAALER, mutated peptide: RTSKAALEL | 9 R->L | binding probability: 0.0025
Iteration-2, mutant_pool size: 95
source peptide: RTSKAALEL, mutated peptide: RTSKRALEL | 5 A->R | binding probability: 0.9967
source peptide: RTSKAALEI, mutated peptide: RTSKQALEI | 5 A->Q | binding probability: 0.9992
source peptide: RTSKAALEI, mutated peptide: RTSKRALEI | 5 A->R | binding probability: 0.9994
source peptide: RTSKAALEL, mutated peptide: RTSKKALEL | 5 A->K | binding probability: 0.9998
source peptide: 

## 2. HLA-A*11:02

In [6]:
# Prepare the HPL-Cluster models for HLA-A*11:02.
# This cell (re)defines `device`, `tokenizer` and `models`, which are passed to
# get_mutated_peptides in the next code cell.
use_cuda = True
gpu_index = 4   # GPU ordinal hard-coded by the original cell
if use_cuda and torch.cuda.is_available():
    # Moving a model to a non-existent ordinal fails with "invalid device ordinal",
    # so fall back to GPU 0 when this machine exposes fewer cards.
    if gpu_index >= torch.cuda.device_count():
        gpu_index = 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")

# NOTE(review): absolute, machine-specific checkpoint root; must end with "/"
# because file names are appended by plain string concatenation below.
model_path = "/data/lujd/neoag_model/main_task/"
## gene
model1_filename = "HPL-Cluster/A_gene/main_finetune_plm_tape_B24_LR1e-05_seq_clip_fold4_ep17_221123.pkl"
## supertype
model2_filename = "HPL-Cluster/A_supertype_1102/main_finetune_plm_tape_B24_LR1e-05_seq_clip_fold4_ep17_230222.pkl"
## sequence
model3_filename = "HPL-Cluster/A_seq_more_1102/main_finetune_plm_tape_B26_LR1e-05_seq_clip_fold4_ep12_230304.pkl"
## semantic
model4_filename = "HPL-Cluster/A_semantic_equal_1102/main_finetune_plm_tape_B26_LR1e-05_seq_clip_fold4_ep11_230309.pkl"
model_names = [model1_filename, model2_filename, model3_filename, model4_filename]

print("Model preparing")
tokenizer = TAPETokenizer(vocab='iupac')
tape_config = ProteinBertConfig.from_pretrained('bert-base')
models = []
for model_name in model_names:
    model = meanTAPE(tape_config, "2mlp").to(device)
    # SECURITY: torch.load unpickles the file; only load checkpoints from a trusted source.
    state_dict = torch.load(model_path + model_name, map_location=device)
    # strict=True raises if the checkpoint keys do not exactly match the model.
    model.load_state_dict(state_dict, strict=True)
    model = model.eval()
    models.append(model)
    print("load one model")
print("Model preparing done")

Model preparing
load one model
load one model
load one model
load one model
Model preparing done


In [7]:
# Call get_mutated_peptides for HLA-A*11:02 with the models loaded in the previous cell.
# Other peptide noted in the original cell (not used in this run): DAVMEAKAK
given_HLA = "HLA-A*11:02"
init_peptide = "EILKRAMEL"

# Keyword settings forwarded unchanged to get_mutated_peptides.
# NOTE(review): their exact meaning is defined in mutation_release — confirm there.
search_kwargs = dict(
    num_mutation=9,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supp_case_file/HPL-Allele",
)
output_pos, output_pep = get_mutated_peptides(
    given_HLA, init_peptide, tokenizer, models, device, **search_kwargs
)

given HLA: HLA-A*11:02, given peptide: EILKRAMEL | binding porbability: 0.0000
Iteration-1, mutant_pool size: 19
source peptide: EILKRAMEL, mutated peptide: EILKRAMEY | 9 L->Y | binding probability: 0.0029
source peptide: EILKRAMEL, mutated peptide: EILKRAMEQ | 9 L->Q | binding probability: 0.0033
source peptide: EILKRAMEL, mutated peptide: EILKRAMEH | 9 L->H | binding probability: 0.0040
source peptide: EILKRAMEL, mutated peptide: EILKRAMER | 9 L->R | binding probability: 0.2842
source peptide: EILKRAMEL, mutated peptide: EILKRAMEK | 9 L->K | binding probability: 0.9841
Iteration-2, mutant_pool size: 95
source peptide: EILKRAMEK, mutated peptide: GILKRAMEK | 1 E->G | binding probability: 0.9989
source peptide: EILKRAMEY, mutated peptide: AILKRAMEY | 1 E->A | binding probability: 0.9989
source peptide: EILKRAMEK, mutated peptide: SILKRAMEK | 1 E->S | binding probability: 0.9991
source peptide: EILKRAMEK, mutated peptide: RILKRAMEK | 1 E->R | binding probability: 0.9992
source peptide: 

## 3. HLA-E*01:03

In [8]:
# Prepare the HPL-Cluster models for HLA-E*01:03 (only the "sequence" and
# "semantic" checkpoints are listed in this cell).
# This cell (re)defines `device`, `tokenizer` and `models`, which are passed to
# get_mutated_peptides in the next code cell.
use_cuda = True
gpu_index = 4   # GPU ordinal hard-coded by the original cell
if use_cuda and torch.cuda.is_available():
    # Moving a model to a non-existent ordinal fails with "invalid device ordinal",
    # so fall back to GPU 0 when this machine exposes fewer cards.
    if gpu_index >= torch.cuda.device_count():
        gpu_index = 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")

# NOTE(review): absolute, machine-specific checkpoint root; must end with "/"
# because file names are appended by plain string concatenation below.
model_path = "/data/lujd/neoag_model/main_task/"
## sequence
model3_filename = "HPL-Cluster/E_seq_more_0103/main_finetune_plm_tape_B26_LR1e-05_seq_clip_fold4_ep15_230309.pkl"
## semantic
model4_filename = "HPL-Cluster/E_semantic_equal_0103/main_finetune_plm_tape_B26_LR1e-05_seq_clip_fold4_ep8_230309.pkl"
model_names = [model3_filename, model4_filename]

print("Model preparing")
tokenizer = TAPETokenizer(vocab='iupac')
tape_config = ProteinBertConfig.from_pretrained('bert-base')
models = []
for model_name in model_names:
    model = meanTAPE(tape_config, "2mlp").to(device)
    # SECURITY: torch.load unpickles the file; only load checkpoints from a trusted source.
    state_dict = torch.load(model_path + model_name, map_location=device)
    # strict=True raises if the checkpoint keys do not exactly match the model.
    model.load_state_dict(state_dict, strict=True)
    model = model.eval()
    models.append(model)
    print("load one model")
print("Model preparing done")

Model preparing
load one model
load one model
Model preparing done


In [18]:
# Call get_mutated_peptides for HLA-E*01:03 with the models loaded in the previous cell.
# Other peptides noted in the original cell (not used in this run):
#   DLPSRLGKI, GLDARAYRL, GTFKNAYFID, FMTRLGPLL, HMAQTLGSL
given_HLA = "HLA-E*01:03"
init_peptide = "ARAGGAGTG"

# Keyword settings forwarded unchanged to get_mutated_peptides.
# NOTE(review): their exact meaning is defined in mutation_release — confirm there.
search_kwargs = dict(
    num_mutation=9,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supp_case_file/HPL-Allele",
)
output_pos, output_pep = get_mutated_peptides(
    given_HLA, init_peptide, tokenizer, models, device, **search_kwargs
)

given HLA: HLA-E*01:03, given peptide: ARAGGAGTG | binding porbability: 0.0000
Iteration-1, mutant_pool size: 19
source peptide: ARAGGAGTG, mutated peptide: ARAGGAGTY | 9 G->Y | binding probability: 0.0017
source peptide: ARAGGAGTG, mutated peptide: ARAGGAGTI | 9 G->I | binding probability: 0.0027
source peptide: ARAGGAGTG, mutated peptide: ARAGGAGTM | 9 G->M | binding probability: 0.5948
source peptide: ARAGGAGTG, mutated peptide: ARAGGAGTF | 9 G->F | binding probability: 0.9409
source peptide: ARAGGAGTG, mutated peptide: ARAGGAGTL | 9 G->L | binding probability: 0.9899
Iteration-2, mutant_pool size: 95
source peptide: ARAGGAGTM, mutated peptide: ARAGGAFTM | 7 G->F | binding probability: 0.9993
source peptide: ARAGGAGTL, mutated peptide: ARAPGAGTL | 4 G->P | binding probability: 0.9994
source peptide: ARAGGAGTL, mutated peptide: ARAQGAGTL | 4 G->Q | binding probability: 0.9994
source peptide: ARAGGAGTI, mutated peptide: ARAGGAFTI | 7 G->F | binding probability: 0.9995
source peptide: 

---
# HPL-Pan

In [19]:
# Prepare the single HPL-Pan model.
# This cell (re)defines `device`, `tokenizer` and `models`, which are passed to
# get_mutated_peptides in the code cells that follow.
use_cuda = True
gpu_index = 4   # GPU ordinal hard-coded by the original cell
if use_cuda and torch.cuda.is_available():
    # Moving a model to a non-existent ordinal fails with "invalid device ordinal",
    # so fall back to GPU 0 when this machine exposes fewer cards.
    if gpu_index >= torch.cuda.device_count():
        gpu_index = 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")

# NOTE(review): absolute, machine-specific checkpoint root; must end with "/"
# because file names are appended by plain string concatenation below.
model_path = "/data/lujd/neoag_model/main_task/"
model1_filename = "HPL-Pan/cat_mean_2mlp/main_finetune_plm_tape_B32_LR3e-05_seq_clip_fold4_ep51_221104.pkl"
model_names = [model1_filename]

print("Model preparing")
tokenizer = TAPETokenizer(vocab='iupac')
tape_config = ProteinBertConfig.from_pretrained('bert-base')
models = []
for model_name in model_names:
    model = meanTAPE(tape_config, "2mlp").to(device)
    # SECURITY: torch.load unpickles the file; only load checkpoints from a trusted source.
    state_dict = torch.load(model_path + model_name, map_location=device)
    # strict=True raises if the checkpoint keys do not exactly match the model.
    model.load_state_dict(state_dict, strict=True)
    model = model.eval()
    models.append(model)
    print("load one model")
print("Model preparing done")

Model preparing
load one model
Model preparing done


## Zero-shot

In [20]:
# Call get_mutated_peptides for HLA-B*42:01 using whatever `models` list is currently loaded.
# Other peptide noted in the original cell (not used in this run): RPGGKKKYK
given_HLA = "HLA-B*42:01"
init_peptide = "RTSKAALER"

# Keyword settings forwarded unchanged to get_mutated_peptides.
# NOTE(review): their exact meaning is defined in mutation_release — confirm there.
search_kwargs = dict(
    num_mutation=9,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supp_case_file/HPL-Pan",
)
output_pos, output_pep = get_mutated_peptides(
    given_HLA, init_peptide, tokenizer, models, device, **search_kwargs
)

given HLA: HLA-B*42:01, given peptide: RTSKAALER | binding porbability: 0.0000
Iteration-1, mutant_pool size: 19
source peptide: RTSKAALER, mutated peptide: RTSKAALEF | 9 R->F | binding probability: 0.0006
source peptide: RTSKAALER, mutated peptide: RTSKAALEM | 9 R->M | binding probability: 0.0044
source peptide: RTSKAALER, mutated peptide: RTSKAALEI | 9 R->I | binding probability: 0.0111
source peptide: RTSKAALER, mutated peptide: RTSKAALEV | 9 R->V | binding probability: 0.0111
source peptide: RTSKAALER, mutated peptide: RTSKAALEL | 9 R->L | binding probability: 0.0264
Iteration-2, mutant_pool size: 95
source peptide: RTSKAALEI, mutated peptide: RTSKQALEI | 5 A->Q | binding probability: 0.9963
source peptide: RTSKAALEM, mutated peptide: RTIKAALEM | 3 S->I | binding probability: 0.9965
source peptide: RTSKAALEI, mutated peptide: RTSKRALEI | 5 A->R | binding probability: 0.9980
source peptide: RTSKAALEL, mutated peptide: RTSKKALEL | 5 A->K | binding probability: 0.9995
source peptide: 

In [6]:
# Call get_mutated_peptides for HLA-A*11:02 using whatever `models` list is currently loaded.
given_HLA = "HLA-A*11:02"
init_peptide = "DAVMEAKAK"

# Keyword settings forwarded unchanged to get_mutated_peptides.
# NOTE(review): their exact meaning is defined in mutation_release — confirm there.
search_kwargs = dict(
    num_mutation=9,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supp_case_file/HPL-Pan",
)
output_pos, output_pep = get_mutated_peptides(
    given_HLA, init_peptide, tokenizer, models, device, **search_kwargs
)

given HLA: HLA-A*11:02, given peptide: DAVMEAKAK | binding porbability: 0.0093
Iteration-1, mutant_pool size: 19
source peptide: DAVMEAKAK, mutated peptide: MAVMEAKAK | 1 D->M | binding probability: 0.9686
source peptide: DAVMEAKAK, mutated peptide: FAVMEAKAK | 1 D->F | binding probability: 0.9746
source peptide: DAVMEAKAK, mutated peptide: TAVMEAKAK | 1 D->T | binding probability: 0.9864
source peptide: DAVMEAKAK, mutated peptide: SAVMEAKAK | 1 D->S | binding probability: 0.9963
source peptide: DAVMEAKAK, mutated peptide: GAVMEAKAK | 1 D->G | binding probability: 0.9976
Iteration-2, mutant_pool size: 95
source peptide: GAVMEAKAK, mutated peptide: GAVMEAEAK | 7 K->E | binding probability: 0.9998
source peptide: GAVMEAKAK, mutated peptide: GAVMEAIAK | 7 K->I | binding probability: 0.9999
source peptide: SAVMEAKAK, mutated peptide: SAVMEAKEK | 8 A->E | binding probability: 0.9999
source peptide: TAVMEAKAK, mutated peptide: TAVMEAKEK | 8 A->E | binding probability: 0.9999
source peptide: 

In [7]:
# Call get_mutated_peptides for HLA-E*01:03 using whatever `models` list is currently loaded.
given_HLA = "HLA-E*01:03"
init_peptide = "DLPSRLGKI"

# Keyword settings forwarded unchanged to get_mutated_peptides.
# NOTE(review): their exact meaning is defined in mutation_release — confirm there.
search_kwargs = dict(
    num_mutation=9,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supp_case_file/HPL-Pan",
)
output_pos, output_pep = get_mutated_peptides(
    given_HLA, init_peptide, tokenizer, models, device, **search_kwargs
)

given HLA: HLA-E*01:03, given peptide: DLPSRLGKI | binding porbability: 0.0000
Iteration-1, mutant_pool size: 19
source peptide: DLPSRLGKI, mutated peptide: DLPSRLHKI | 7 G->H | binding probability: 0.8489
source peptide: DLPSRLGKI, mutated peptide: DLPSRLNKI | 7 G->N | binding probability: 0.8597
source peptide: DLPSRLGKI, mutated peptide: DLPSRLVKI | 7 G->V | binding probability: 0.8701
source peptide: DLPSRLGKI, mutated peptide: DLPSRLIKI | 7 G->I | binding probability: 0.8945
source peptide: DLPSRLGKI, mutated peptide: DLPSRLLKI | 7 G->L | binding probability: 0.9271
Iteration-2, mutant_pool size: 95
source peptide: DLPSRLVKI, mutated peptide: DHPSRLVKI | 2 L->H | binding probability: 0.9994
source peptide: DLPSRLVKI, mutated peptide: DRPSRLVKI | 2 L->R | binding probability: 0.9994
source peptide: DLPSRLNKI, mutated peptide: SLPSRLNKI | 1 D->S | binding probability: 0.9995
source peptide: DLPSRLVKI, mutated peptide: DQPSRLVKI | 2 L->Q | binding probability: 0.9997
source peptide: 

## Common

In [30]:
# Call get_mutated_peptides for HLA-A*02:01 using whatever `models` list is currently loaded.
# Other peptide noted in the original cell (not used in this run): DQTYNEMGD
given_HLA = "HLA-A*02:01"
init_peptide = "YKSRCYVGL"

# Keyword settings forwarded unchanged to get_mutated_peptides.
# NOTE(review): their exact meaning is defined in mutation_release — confirm there.
search_kwargs = dict(
    num_mutation=9,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supp_case_file/HPL-Pan",
)
output_pos, output_pep = get_mutated_peptides(
    given_HLA, init_peptide, tokenizer, models, device, **search_kwargs
)

given HLA: HLA-A*02:01, given peptide: YKSRCYVGL | binding porbability: 0.0000
Iteration-1, mutant_pool size: 19
source peptide: YKSRCYVGL, mutated peptide: YQSRCYVGL | 2 K->Q | binding probability: 0.1054
source peptide: YKSRCYVGL, mutated peptide: YVSRCYVGL | 2 K->V | binding probability: 0.5198
source peptide: YKSRCYVGL, mutated peptide: YMSRCYVGL | 2 K->M | binding probability: 0.8098
source peptide: YKSRCYVGL, mutated peptide: YISRCYVGL | 2 K->I | binding probability: 0.9649
source peptide: YKSRCYVGL, mutated peptide: YLSRCYVGL | 2 K->L | binding probability: 0.9872
Iteration-2, mutant_pool size: 95
source peptide: YLSRCYVGL, mutated peptide: YLSRCYVYL | 8 G->Y | binding probability: 0.9987
source peptide: YLSRCYVGL, mutated peptide: YLSRCYVLL | 8 G->L | binding probability: 0.9988
source peptide: YVSRCYVGL, mutated peptide: YVSECYVGL | 4 R->E | binding probability: 0.9990
source peptide: YLSRCYVGL, mutated peptide: YLSRCYVFL | 8 G->F | binding probability: 0.9994
source peptide: 

In [27]:
# Call get_mutated_peptides for HLA-A*03:01 using whatever `models` list is currently loaded.
given_HLA = "HLA-A*03:01"
init_peptide = "ELNKGWFGA"

# Keyword settings forwarded unchanged to get_mutated_peptides.
# NOTE(review): their exact meaning is defined in mutation_release — confirm there.
search_kwargs = dict(
    num_mutation=9,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supp_case_file/HPL-Pan",
)
output_pos, output_pep = get_mutated_peptides(
    given_HLA, init_peptide, tokenizer, models, device, **search_kwargs
)

given HLA: HLA-A*03:01, given peptide: ELNKGWFGA | binding porbability: 0.0000
Iteration-1, mutant_pool size: 19
source peptide: ELNKGWFGA, mutated peptide: ELNKGWFGL | 9 A->L | binding probability: 0.0054
source peptide: ELNKGWFGA, mutated peptide: ELNKGWFGV | 9 A->V | binding probability: 0.0101
source peptide: ELNKGWFGA, mutated peptide: ELNKGWFGM | 9 A->M | binding probability: 0.0250
source peptide: ELNKGWFGA, mutated peptide: ELNKGWFGY | 9 A->Y | binding probability: 0.1231
source peptide: ELNKGWFGA, mutated peptide: ELNKGWFGK | 9 A->K | binding probability: 0.8131
Iteration-2, mutant_pool size: 95
source peptide: ELNKGWFGK, mutated peptide: ELNKGDFGK | 6 W->D | binding probability: 0.9992
source peptide: ELNKGWFGK, mutated peptide: ELNKGLFGK | 6 W->L | binding probability: 0.9995
source peptide: ELNKGWFGK, mutated peptide: ELNKGQFGK | 6 W->Q | binding probability: 0.9997
source peptide: ELNKGWFGK, mutated peptide: ELNKGIFGK | 6 W->I | binding probability: 0.9998
source peptide: 

In [21]:
# Call get_mutated_peptides for HLA-A*02:02 using whatever `models` list is currently loaded.
given_HLA = "HLA-A*02:02"
init_peptide = "ALPPRAYAM"

# Keyword settings forwarded unchanged to get_mutated_peptides.
# NOTE(review): their exact meaning is defined in mutation_release — confirm there.
search_kwargs = dict(
    num_mutation=9,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supp_case_file/HPL-Pan",
)
output_pos, output_pep = get_mutated_peptides(
    given_HLA, init_peptide, tokenizer, models, device, **search_kwargs
)

given HLA: HLA-A*02:02, given peptide: ALPPRAYAM | binding porbability: 0.0009
Iteration-1, mutant_pool size: 19
source peptide: ALPPRAYAM, mutated peptide: ALPPRANAM | 7 Y->N | binding probability: 0.8687
source peptide: ALPPRAYAM, mutated peptide: ALPPRADAM | 7 Y->D | binding probability: 0.8749
source peptide: ALPPRAYAM, mutated peptide: ALPPRAVAM | 7 Y->V | binding probability: 0.8986
source peptide: ALPPRAYAM, mutated peptide: ALPPRAIAM | 7 Y->I | binding probability: 0.9219
source peptide: ALPPRAYAM, mutated peptide: ALPPRAEAM | 7 Y->E | binding probability: 0.9328
Iteration-2, mutant_pool size: 95
source peptide: ALPPRAIAM, mutated peptide: ALPPPAIAM | 5 R->P | binding probability: 0.9987
source peptide: ALPPRAVAM, mutated peptide: ALFPRAVAM | 3 P->F | binding probability: 0.9987
source peptide: ALPPRAIAM, mutated peptide: ALPPLAIAM | 5 R->L | binding probability: 0.9988
source peptide: ALPPRAVAM, mutated peptide: ALYPRAVAM | 3 P->Y | binding probability: 0.9991
source peptide: 

In [34]:
# Call get_mutated_peptides for HLA-B*40:01 using whatever `models` list is currently loaded.
# Other peptides noted in the original cell (not used in this run): KIRLRPGGK, ADYSVLYNST
given_HLA = "HLA-B*40:01"
init_peptide = "NSSKVSQNY"

# Keyword settings forwarded unchanged to get_mutated_peptides.
# NOTE(review): num_mutation is 14 here while most other cells use 9 — confirm this is intended.
search_kwargs = dict(
    num_mutation=14,
    num_peptides=5,
    prob_limit=0.5,
    writein_file=True,
    algorithm="2a",
    filename="supp_case_file/HPL-Pan",
)
output_pos, output_pep = get_mutated_peptides(
    given_HLA, init_peptide, tokenizer, models, device, **search_kwargs
)

given HLA: HLA-B*40:01, given peptide: NSSKVSQNY | binding porbability: 0.0001
Iteration-1, mutant_pool size: 19
source peptide: NSSKVSQNY, mutated peptide: NSSKVYQNY | 6 S->Y | binding probability: 0.0001
source peptide: NSSKVSQNY, mutated peptide: NSSKVMQNY | 6 S->M | binding probability: 0.0002
source peptide: NSSKVSQNY, mutated peptide: NSSKVFQNY | 6 S->F | binding probability: 0.0002
source peptide: NSSKVSQNY, mutated peptide: NSSKVAQNY | 6 S->A | binding probability: 0.0004
source peptide: NSSKVSQNY, mutated peptide: NSSKVGQNY | 6 S->G | binding probability: 0.0005
Iteration-2, mutant_pool size: 95
source peptide: NSSKVMQNY, mutated peptide: NSSKVMQHY | 8 N->H | binding probability: 0.1402
source peptide: NSSKVGQNY, mutated peptide: NSSKVGQQY | 8 N->Q | binding probability: 0.1490
source peptide: NSSKVMQNY, mutated peptide: NSSKVMQYY | 8 N->Y | binding probability: 0.2154
source peptide: NSSKVMQNY, mutated peptide: NSSKVMQFY | 8 N->F | binding probability: 0.2365
source peptide: 

In [41]:
# Call get_mutated_peptides for HLA-C*01:02 using whatever `models` list is currently loaded.
given_HLA = "HLA-C*01:02"
# init_peptide = "VMAYDRYLA"   # unused alternative: it was a dead store, overwritten by the next line
init_peptide = "SLTAADIGR"
# NOTE(review): num_mutation is 11 here while most other cells use 9 — confirm this is intended.
output_pos, output_pep = get_mutated_peptides(given_HLA, init_peptide, tokenizer, models, device,
                        num_mutation=11, num_peptides=5, prob_limit=0.5,
                        writein_file=True, algorithm="2a", filename="supp_case_file/HPL-Pan")

given HLA: HLA-C*01:02, given peptide: SLTAADIGR | binding porbability: 0.0000
Iteration-1, mutant_pool size: 19
source peptide: SLTAADIGR, mutated peptide: SLMAADIGR | 3 T->M | binding probability: 0.0007
source peptide: SLTAADIGR, mutated peptide: SLGAADIGR | 3 T->G | binding probability: 0.0007
source peptide: SLTAADIGR, mutated peptide: SLPAADIGR | 3 T->P | binding probability: 0.0029
source peptide: SLTAADIGR, mutated peptide: SLEAADIGR | 3 T->E | binding probability: 0.0101
source peptide: SLTAADIGR, mutated peptide: SLDAADIGR | 3 T->D | binding probability: 0.8731
Iteration-2, mutant_pool size: 95
source peptide: SLEAADIGR, mutated peptide: SLEAADIAR | 8 G->A | binding probability: 0.9714
source peptide: SLEAADIGR, mutated peptide: SLEAADITR | 8 G->T | binding probability: 0.9816
source peptide: SLDAADIGR, mutated peptide: YLDAADIGR | 1 S->Y | binding probability: 0.9857
source peptide: SLPAADIGR, mutated peptide: SLPAADIGM | 9 R->M | binding probability: 0.9962
source peptide: 