Input:
- HLA-A*02:02, TIQQCQSPT
- HLA-A*02:01, DQTYNEMGD

---
# algorithm 1a -> HLA-A*02:02

In [1]:
import torch
from mutation import get_mutated_peptides

# Use the first CUDA device only when it is both requested and available; otherwise use the CPU.
use_cuda = True
if use_cuda and torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Query: the HLA allele and the starting peptide handed to the mutation search.
given_HLA = "HLA-A*02:02"
init_peptide = "TIQQCQSPT"

# Run mutation algorithm "1a"; the returned peptides are inspected by the cells below.
mutant_peptides = get_mutated_peptides(
    given_HLA,
    init_peptide,
    device,
    num_mutation=5,
    num_peptides=5,
    algorithm="1a",
)
# Timing note: with 4 iterations, the GPU (3090) takes ~15s while the CPU takes ~300s.
# batch=64, 3000M (presumably GPU memory used at that batch size -- TODO confirm)

HLA_seq_dict preparing
Model preparing
given HLA: HLA-A*02:02, given peptide: TIQQCQSPT | binding porbability: 0.0035
************** Run algorithm-1a **************
Iteration-1, mutant_pool size: 171
source peptide: TIQQCQSPT, mutated peptide: TIQQCQSNT | 8 P->N | binding probability: 0.4029
source peptide: TIQQCQSPT, mutated peptide: TIQQIQSPT | 5 C->I | binding probability: 0.6952
source peptide: TIQQCQSPT, mutated peptide: TIQNCQSPT | 4 Q->N | binding probability: 0.7288
source peptide: TIQQCQSPT, mutated peptide: TIQQCQSYT | 8 P->Y | binding probability: 0.7901
source peptide: TIQQCQSPT, mutated peptide: TIQQCQSFT | 8 P->F | binding probability: 0.8279
Iteration-2, mutant_pool size: 753
source peptide: TIQQCQSFT, mutated peptide: TIQQMQSFT | 5 C->M | binding probability: 0.9941
source peptide: TIQQCQSYT, mutated peptide: TIQQVQSYT | 5 C->V | binding probability: 0.9956
source peptide: TIQQIQSPT, mutated peptide: TIQQIQSVT | 8 P->V | binding probability: 0.9956
source peptide: TIQQC

In [2]:
given_HLA = "HLA-A*02:02"

import pandas as pd

# Read the four labelled splits in a fixed order (the first CSV column becomes the index)
# and keep only the columns needed for the look-up below.
data_path = "/data/lujd/neoag_data/main_task/"
train_data_raw, val_data_raw, test_data_raw, external_data_raw = [
    pd.read_csv(data_path + csv_name, index_col=0)[['peptide', 'HLA', 'label']]
    for csv_name in (
        'train_data_fold4.csv',
        'val_data_fold4.csv',
        'independent_set.csv',
        'external_set.csv',
    )
]
merged_data = pd.concat([train_data_raw, val_data_raw, test_data_raw, external_data_raw])

# Keep the rows of the queried allele, then split its peptides by label (1 -> pos, 0 -> neg).
target_data = merged_data.loc[merged_data["HLA"] == given_HLA]
pos_peptides = target_data.loc[target_data["label"] == 1, "peptide"].to_list()
neg_peptides = target_data.loc[target_data["label"] == 0, "peptide"].to_list()

# Size summary: allele-specific rows vs. everything, and the pos/neg split within the allele.
print(f"target/all: {len(target_data)}/{len(merged_data)}")
print(f"pos/target: {len(pos_peptides)}/{len(target_data)}")
print(f"neg/target: {len(neg_peptides)}/{len(target_data)}")

target/all: 5011/924412
pos/target: 2507/5011
neg/target: 2504/5011


In [3]:
# Report, for every generated mutant, whether it already occurs among the known
# positive or negative peptides of the queried allele.
# Sets are built once so each membership test is a hash look-up rather than a
# linear scan of the peptide lists; the printed messages are unchanged.
pos_peptide_set = set(pos_peptides)
neg_peptide_set = set(neg_peptides)

for mutant_peptide in mutant_peptides:
    if mutant_peptide in pos_peptide_set:
        print(f"{mutant_peptide} is in our positive dataset")
    elif mutant_peptide in neg_peptide_set:
        print(f"{mutant_peptide} is in our negative dataset")
    else:
        print(f"{mutant_peptide} isn't in our dataset")

TIQQCQSNT isn't in our dataset
TIQQIQSPT isn't in our dataset
TIQNCQSPT isn't in our dataset
TIQQCQSYT isn't in our dataset
TIQQCQSFT isn't in our dataset
TIQQMQSFT isn't in our dataset
TIQQVQSYT isn't in our dataset
TIQQIQSVT isn't in our dataset
TIQQLQSFT isn't in our dataset
TIQQLQSYT isn't in our dataset
TIQQLQSFV isn't in our dataset
LIQQLQSYT isn't in our dataset
TIAQIQSVT isn't in our dataset
IIQQLQSYT isn't in our dataset
TISQIQSVT isn't in our dataset
TISHIQSVT isn't in our dataset
TIAQIQSVL isn't in our dataset
TIAQIQSVI isn't in our dataset
TISQLQSFV isn't in our dataset
IISQIQSVT isn't in our dataset
TIAPIQSVL isn't in our dataset
TIAEIQSVL isn't in our dataset
TIAQIVSVL isn't in our dataset
TISHIQSVV isn't in our dataset
TISHLQSFV isn't in our dataset


In [4]:
import numpy as np
import torch
import torch.nn as nn
from TransPHLA_model import Transformer

# Look-up tables: HLA allele name -> its "short" sequence column, and character -> token id.
hla_seq_dict = pd.read_csv(
    data_path + "HLA_sequence_dict_ABCEG.csv", index_col=0
).set_index(["HLA_name"])["short"].to_dict()
HLA_seq = hla_seq_dict[given_HLA]
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load a trusted vocab file.
vocab = np.load(data_path + "vocab_dict.npy", allow_pickle=True).item()

model_path = "/data/lujd/neoag_model/main_task/"
model_name = "TransPHLA/TransPHLA_official_model.pkl"
model_eval = Transformer()
# map_location="cpu": this cell feeds CPU tensors to the model, so the checkpoint must be
# loadable even when it was saved from a GPU and no CUDA device is present.
model_eval.load_state_dict(
    torch.load(model_path + model_name, map_location="cpu"), strict=True
)
model_eval.eval()

hla_max_len = 34
pep_max_len = 15
# Right-pad with "-" to the fixed length and wrap in a list to get a batch of one.
hla_token = [vocab[n] for n in HLA_seq.ljust(hla_max_len, "-")]
hla_token = torch.LongTensor([hla_token])

report_line = "{}\tTransPHLA report: given HLA {}, given peptide {} | binding porbability = {:.4f}"

# Inference only: no_grad() avoids building an autograd graph for every forward pass.
with torch.no_grad():
    # Row 1 is the initial peptide; the mutants follow as rows 2, 3, ...
    for order, peptide in enumerate([init_peptide, *mutant_peptides], start=1):
        pep_token = [vocab[n] for n in peptide.ljust(pep_max_len, "-")]
        pep_token = torch.LongTensor([pep_token])
        val_outputs, _, _, _ = model_eval(pep_token, hla_token)
        # Column 1 of the softmax over the two outputs is reported as the binding probability.
        y_prob_val = nn.Softmax(dim=1)(val_outputs)[:, 1].cpu().numpy()
        print(report_line.format(order, given_HLA, peptide, y_prob_val.item()))

1	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSPT | binding porbability = 0.0032
2	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSNT | binding porbability = 0.0008
3	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQIQSPT | binding porbability = 0.0002
4	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQNCQSPT | binding porbability = 0.0020
5	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSYT | binding porbability = 0.0001
6	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSFT | binding porbability = 0.0001
7	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQMQSFT | binding porbability = 0.0004
8	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQVQSYT | binding porbability = 0.0000
9	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQIQSVT | binding porbability = 0.0000
10	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQLQSFT | binding porbability = 0.0000
11	TransPHLA report

# algorithm 1b -> HLA-A*02:02

In [1]:
import torch
from mutation import get_mutated_peptides

# Pick the compute device: "cuda:0" when requested and available, CPU otherwise.
use_cuda = True
gpu_ready = use_cuda and torch.cuda.is_available()
device = torch.device("cuda:0") if gpu_ready else torch.device("cpu")

# Allele / starting peptide pair to optimise.
given_HLA = "HLA-A*02:02"
init_peptide = "TIQQCQSPT"

# Run mutation algorithm "1b"; its result feeds the dataset check and TransPHLA cells below.
mutant_peptides = get_mutated_peptides(
    given_HLA, init_peptide, device,
    num_mutation=5,
    num_peptides=5,
    algorithm="1b",
)
# Timing note: with 4 iterations, the GPU (3090) takes ~15s while the CPU takes ~300s.
# batch=64, 3000M (presumably GPU memory used at that batch size -- TODO confirm)

HLA_seq_dict preparing
Model preparing
given HLA: HLA-A*02:02, given peptide: TIQQCQSPT | binding porbability: 0.0035
************** Run algorithm-1b **************
Iteration-1, mutant_pool size: 171
source peptide: TIQQCQSPT, mutated peptide: TIQQCQSFT | 8 P->F | binding probability: 0.8279
source peptide: TIQQCQSPT, mutated peptide: TIQQCQSYT | 8 P->Y | binding probability: 0.7901
source peptide: TIQQCQSPT, mutated peptide: TIQQCQSNT | 8 P->N | binding probability: 0.4029
source peptide: TIQQCQSPT, mutated peptide: TIQQCQSWT | 8 P->W | binding probability: 0.1483
source peptide: TIQQCQSPT, mutated peptide: TIQQCQSQT | 8 P->Q | binding probability: 0.1375
Iteration-2, mutant_pool size: 760
source peptide: TIQQCQSFT, mutated peptide: TIQQLQSFT | 5 C->L | binding probability: 0.9961
source peptide: TIQQCQSFT, mutated peptide: TIQQMQSFT | 5 C->M | binding probability: 0.9941
source peptide: TIQQCQSFT, mutated peptide: TIQQNQSFT | 5 C->N | binding probability: 0.9934
source peptide: TIQQC

In [2]:
import pandas as pd
# Read the four labelled splits (the first CSV column becomes the index), keeping only
# the columns needed for the look-up below.
data_path = "/data/lujd/neoag_data/main_task/"
train_data_raw, val_data_raw, test_data_raw, external_data_raw = [
    pd.read_csv(data_path + csv_name, index_col=0)[['peptide', 'HLA', 'label']]
    for csv_name in (
        'train_data_fold4.csv',
        'val_data_fold4.csv',
        'independent_set.csv',
        'external_set.csv',
    )
]
merged_data = pd.concat([train_data_raw, val_data_raw, test_data_raw, external_data_raw], axis=0)

# Rows of the queried allele, split into peptides labelled 1 (pos) and 0 (neg).
target_data = merged_data[merged_data.HLA==given_HLA]
pos_peptides = target_data[target_data.label==1].peptide.to_list()
neg_peptides = target_data[target_data.label==0].peptide.to_list()
print(f"target/all: {len(target_data)}/{len(merged_data)}")
print(f"pos/target: {len(pos_peptides)}/{len(target_data)}")
print(f"neg/target: {len(neg_peptides)}/{len(target_data)}")

# Membership is tested against sets built once, so each look-up is a hash probe instead
# of a linear scan over the peptide lists. The lists themselves are kept for the counts
# printed above; the printed messages are unchanged.
pos_peptide_set = set(pos_peptides)
neg_peptide_set = set(neg_peptides)

for mutant_peptide in mutant_peptides:
    if mutant_peptide in pos_peptide_set:
        print(f"{mutant_peptide} is in our positive dataset")
    elif mutant_peptide in neg_peptide_set:
        print(f"{mutant_peptide} is in our negative dataset")
    else:
        print(f"{mutant_peptide} isn't in our dataset")

target/all: 5011/924412
pos/target: 2507/5011
neg/target: 2504/5011
TIQQCQSFT isn't in our dataset
TIQQCQSYT isn't in our dataset
TIQQCQSNT isn't in our dataset
TIQQCQSWT isn't in our dataset
TIQQCQSQT isn't in our dataset
TIQQLQSFT isn't in our dataset
TIQQMQSFT isn't in our dataset
TIQQNQSFT isn't in our dataset
TIQQVQSFT isn't in our dataset
TIQQHQSFT isn't in our dataset
TIQQLQSFV isn't in our dataset
TIQQLQSFA isn't in our dataset
TIQQLQSFI isn't in our dataset
TIQQHQSFI isn't in our dataset
TIQQLQSFL isn't in our dataset
TIAQLQSFL isn't in our dataset
TISQLQSFL isn't in our dataset
TISQLQSFV isn't in our dataset
TITQLQSFL isn't in our dataset
TITQLQSFV isn't in our dataset
TISHLQSFV isn't in our dataset
TITHLQSFV isn't in our dataset
TITNLQSFV isn't in our dataset
TISNLQSFV isn't in our dataset
TIAHLQSFL isn't in our dataset


In [3]:
import numpy as np
import torch
import torch.nn as nn
from TransPHLA_model import Transformer

# Look-up tables: HLA allele name -> its "short" sequence column, and character -> token id.
hla_seq_dict = pd.read_csv(
    data_path + "HLA_sequence_dict_ABCEG.csv", index_col=0
).set_index(["HLA_name"])["short"].to_dict()
HLA_seq = hla_seq_dict[given_HLA]
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load a trusted vocab file.
vocab = np.load(data_path + "vocab_dict.npy", allow_pickle=True).item()

model_path = "/data/lujd/neoag_model/main_task/"
model_name = "TransPHLA/TransPHLA_official_model.pkl"
model_eval = Transformer()
# map_location="cpu": this cell feeds CPU tensors to the model, so the checkpoint must be
# loadable even when it was saved from a GPU and no CUDA device is present.
model_eval.load_state_dict(
    torch.load(model_path + model_name, map_location="cpu"), strict=True
)
model_eval.eval()

hla_max_len = 34
pep_max_len = 15
# Right-pad with "-" to the fixed length and wrap in a list to get a batch of one.
hla_token = [vocab[n] for n in HLA_seq.ljust(hla_max_len, "-")]
hla_token = torch.LongTensor([hla_token])

report_line = "{}\tTransPHLA report: given HLA {}, given peptide {} | binding porbability = {:.4f}"

# Inference only: no_grad() avoids building an autograd graph for every forward pass.
with torch.no_grad():
    # Row 1 is the initial peptide; the mutants follow as rows 2, 3, ...
    for order, peptide in enumerate([init_peptide, *mutant_peptides], start=1):
        pep_token = [vocab[n] for n in peptide.ljust(pep_max_len, "-")]
        pep_token = torch.LongTensor([pep_token])
        val_outputs, _, _, _ = model_eval(pep_token, hla_token)
        # Column 1 of the softmax over the two outputs is reported as the binding probability.
        y_prob_val = nn.Softmax(dim=1)(val_outputs)[:, 1].cpu().numpy()
        print(report_line.format(order, given_HLA, peptide, y_prob_val.item()))

1	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSPT | binding porbability = 0.0032
2	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSFT | binding porbability = 0.0001
3	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSYT | binding porbability = 0.0001
4	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSNT | binding porbability = 0.0008
5	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSWT | binding porbability = 0.0003
6	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSQT | binding porbability = 0.0000
7	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQLQSFT | binding porbability = 0.0000
8	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQMQSFT | binding porbability = 0.0004
9	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQNQSFT | binding porbability = 0.0000
10	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQVQSFT | binding porbability = 0.0000
11	TransPHLA report

# algorithm 2a -> HLA-A*02:02

In [5]:
import torch
from mutation import get_mutated_peptides

use_cuda = True  # request the GPU; the CPU is used when CUDA is unavailable
if use_cuda and torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Allele / starting peptide pair to optimise.
given_HLA = "HLA-A*02:02"
init_peptide = "TIQQCQSPT"

# Run mutation algorithm "2a"; the returned peptides are inspected by the cells below.
mutant_peptides = get_mutated_peptides(
    given_HLA,
    init_peptide,
    device,
    num_mutation=5,
    num_peptides=5,
    algorithm="2a",
)
# Timing note: with 4 iterations, the GPU (3090) takes ~14s while the CPU takes ~170s.
# Observation: with num_mutation=3, num_peptides=10, more than one position changed at iteration-3.

HLA_seq_dict preparing
Model preparing
given HLA: HLA-A*02:02, given peptide: TIQQCQSPT | binding porbability: 0.0035
************** Run algorithm-2a **************
Iteration-1, mutant_pool size: 19
source peptide: TIQQCQSPT, mutated peptide: TIQQCQSQT | 8 P->Q | binding probability: 0.1375
source peptide: TIQQCQSPT, mutated peptide: TIQQCQSWT | 8 P->W | binding probability: 0.1483
source peptide: TIQQCQSPT, mutated peptide: TIQQCQSNT | 8 P->N | binding probability: 0.4029
source peptide: TIQQCQSPT, mutated peptide: TIQQCQSYT | 8 P->Y | binding probability: 0.7901
source peptide: TIQQCQSPT, mutated peptide: TIQQCQSFT | 8 P->F | binding probability: 0.8279
Iteration-2, mutant_pool size: 95
source peptide: TIQQCQSYT, mutated peptide: TIQQMQSYT | 5 C->M | binding probability: 0.9941
source peptide: TIQQCQSFT, mutated peptide: TIQQMQSFT | 5 C->M | binding probability: 0.9941
source peptide: TIQQCQSYT, mutated peptide: TIQQVQSYT | 5 C->V | binding probability: 0.9956
source peptide: TIQQCQS

In [6]:
given_HLA = "HLA-A*02:02"

import pandas as pd

data_path = "/data/lujd/neoag_data/main_task/"
wanted_columns = ['peptide', 'HLA', 'label']


def _read_split(csv_name):
    """Read one split from `data_path` (first CSV column as index), keeping `wanted_columns`."""
    frame = pd.read_csv(data_path + csv_name, index_col=0)
    return frame[wanted_columns]


# Same read order as before: training fold, validation fold, independent set, external set.
train_data_raw = _read_split('train_data_fold4.csv')
val_data_raw = _read_split('val_data_fold4.csv')
test_data_raw = _read_split('independent_set.csv')
external_data_raw = _read_split('external_set.csv')
merged_data = pd.concat([train_data_raw, val_data_raw, test_data_raw, external_data_raw])

# Rows of the queried allele, then its peptides split by label (1 -> pos, 0 -> neg).
is_target_allele = merged_data["HLA"] == given_HLA
target_data = merged_data[is_target_allele]
pos_peptides = target_data["peptide"][target_data["label"] == 1].to_list()
neg_peptides = target_data["peptide"][target_data["label"] == 0].to_list()

print(f"target/all: {len(target_data)}/{len(merged_data)}")
print(f"pos/target: {len(pos_peptides)}/{len(target_data)}")
print(f"neg/target: {len(neg_peptides)}/{len(target_data)}")

target/all: 5011/924412
pos/target: 2507/5011
neg/target: 2504/5011


In [7]:
# Report, for every generated mutant, whether it already occurs among the known
# positive or negative peptides of the queried allele.
# Sets are built once so each membership test is a hash look-up rather than a
# linear scan of the peptide lists; the printed messages are unchanged.
pos_peptide_set = set(pos_peptides)
neg_peptide_set = set(neg_peptides)

for mutant_peptide in mutant_peptides:
    if mutant_peptide in pos_peptide_set:
        print(f"{mutant_peptide} is in our positive dataset")
    elif mutant_peptide in neg_peptide_set:
        print(f"{mutant_peptide} is in our negative dataset")
    else:
        print(f"{mutant_peptide} isn't in our dataset")

TIQQCQSQT isn't in our dataset
TIQQCQSWT isn't in our dataset
TIQQCQSNT isn't in our dataset
TIQQCQSYT isn't in our dataset
TIQQCQSFT isn't in our dataset
TIQQMQSYT isn't in our dataset
TIQQMQSFT isn't in our dataset
TIQQVQSYT isn't in our dataset
TIQQLQSFT isn't in our dataset
TIQQLQSYT isn't in our dataset
MIQQLQSYT isn't in our dataset
IIQQVQSYT isn't in our dataset
FIQQLQSYT isn't in our dataset
LIQQLQSYT isn't in our dataset
IIQQLQSYT isn't in our dataset
IISQLQSYT isn't in our dataset
FIQQLQSYV isn't in our dataset
FIQQLQSYA isn't in our dataset
FIQQLQSYL isn't in our dataset
FIQQLQSYI isn't in our dataset
FIAQLQSYI isn't in our dataset
IISQLQSYL isn't in our dataset
IISQLQSYI isn't in our dataset
FIQDLQSYL isn't in our dataset
FIQHLQSYL isn't in our dataset


In [8]:
import numpy as np
import torch
import torch.nn as nn
from TransPHLA_model import Transformer

# Look-up tables: HLA allele name -> its "short" sequence column, and character -> token id.
hla_seq_dict = pd.read_csv(
    data_path + "HLA_sequence_dict_ABCEG.csv", index_col=0
).set_index(["HLA_name"])["short"].to_dict()
HLA_seq = hla_seq_dict[given_HLA]
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load a trusted vocab file.
vocab = np.load(data_path + "vocab_dict.npy", allow_pickle=True).item()

model_path = "/data/lujd/neoag_model/main_task/"
model_name = "TransPHLA/TransPHLA_official_model.pkl"
model_eval = Transformer()
# map_location="cpu": this cell feeds CPU tensors to the model, so the checkpoint must be
# loadable even when it was saved from a GPU and no CUDA device is present.
model_eval.load_state_dict(
    torch.load(model_path + model_name, map_location="cpu"), strict=True
)
model_eval.eval()

hla_max_len = 34
pep_max_len = 15
# Right-pad with "-" to the fixed length and wrap in a list to get a batch of one.
hla_token = [vocab[n] for n in HLA_seq.ljust(hla_max_len, "-")]
hla_token = torch.LongTensor([hla_token])

report_line = "{}\tTransPHLA report: given HLA {}, given peptide {} | binding porbability = {:.4f}"

# Inference only: no_grad() avoids building an autograd graph for every forward pass.
with torch.no_grad():
    # Row 1 is the initial peptide; the mutants follow as rows 2, 3, ...
    for order, peptide in enumerate([init_peptide, *mutant_peptides], start=1):
        pep_token = [vocab[n] for n in peptide.ljust(pep_max_len, "-")]
        pep_token = torch.LongTensor([pep_token])
        val_outputs, _, _, _ = model_eval(pep_token, hla_token)
        # Column 1 of the softmax over the two outputs is reported as the binding probability.
        y_prob_val = nn.Softmax(dim=1)(val_outputs)[:, 1].cpu().numpy()
        print(report_line.format(order, given_HLA, peptide, y_prob_val.item()))

1	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSPT | binding porbability = 0.0032
2	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSQT | binding porbability = 0.0000
3	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSWT | binding porbability = 0.0003
4	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSNT | binding porbability = 0.0008
5	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSYT | binding porbability = 0.0001
6	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSFT | binding porbability = 0.0001
7	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQMQSYT | binding porbability = 0.0000
8	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQMQSFT | binding porbability = 0.0004
9	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQVQSYT | binding porbability = 0.0000
10	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQLQSFT | binding porbability = 0.0000
11	TransPHLA report

# algorithm-2b HLA-A*02:02

In [24]:
import torch
from mutation import get_mutated_peptides

use_cuda = True  # True requests the GPU (set to False to force the CPU)
gpu_ready = use_cuda and torch.cuda.is_available()
device = torch.device("cuda:0") if gpu_ready else torch.device("cpu")

# Allele / starting peptide pair to optimise.
given_HLA = "HLA-A*02:02"
init_peptide = "TIQQCQSPT"

# Run mutation algorithm "2b"; its result feeds the dataset check and TransPHLA cells below.
mutant_peptides = get_mutated_peptides(
    given_HLA, init_peptide, device,
    num_mutation=5,
    num_peptides=5,
    algorithm="2b",
)
# Timing note: with 4 iterations, the GPU (3090) takes ~14s while the CPU takes ~170s.
# Observation: with num_mutation=3, num_peptides=10, more than one position changed at iteration-3.

HLA_seq_dict preparing
Model preparing
given HLA: HLA-A*02:02, given peptide: TIQQCQSPT | binding porbability: 0.0035
************** Run algorithm-2b **************
Iteration-1, mutant_pool size: 19
source peptide: TIQQCQSPT, mutated peptide: TIQQCQSQT | 8 P->Q | binding probability: 0.1375
source peptide: TIQQCQSPT, mutated peptide: TIQQCQSWT | 8 P->W | binding probability: 0.1483
source peptide: TIQQCQSPT, mutated peptide: TIQQCQSNT | 8 P->N | binding probability: 0.4029
source peptide: TIQQCQSPT, mutated peptide: TIQQCQSYT | 8 P->Y | binding probability: 0.7901
source peptide: TIQQCQSPT, mutated peptide: TIQQCQSFT | 8 P->F | binding probability: 0.8279
Iteration-2, mutant_pool size: 95
source peptide: TIQQCQSYT, mutated peptide: TIQQMQSYT | 5 C->M | binding probability: 0.9941
source peptide: TIQQCQSFT, mutated peptide: TIQQMQSFT | 5 C->M | binding probability: 0.9941
source peptide: TIQQCQSYT, mutated peptide: TIQQVQSYT | 5 C->V | binding probability: 0.9956
source peptide: TIQQCQS

In [25]:
import pandas as pd
# Read the four labelled splits (the first CSV column becomes the index), keeping only
# the columns needed for the look-up below.
data_path = "/data/lujd/neoag_data/main_task/"
train_data_raw, val_data_raw, test_data_raw, external_data_raw = [
    pd.read_csv(data_path + csv_name, index_col=0)[['peptide', 'HLA', 'label']]
    for csv_name in (
        'train_data_fold4.csv',
        'val_data_fold4.csv',
        'independent_set.csv',
        'external_set.csv',
    )
]
merged_data = pd.concat([train_data_raw, val_data_raw, test_data_raw, external_data_raw], axis=0)

# Rows of the queried allele, split into peptides labelled 1 (pos) and 0 (neg).
target_data = merged_data[merged_data.HLA==given_HLA]
pos_peptides = target_data[target_data.label==1].peptide.to_list()
neg_peptides = target_data[target_data.label==0].peptide.to_list()
print(f"target/all: {len(target_data)}/{len(merged_data)}")
print(f"pos/target: {len(pos_peptides)}/{len(target_data)}")
print(f"neg/target: {len(neg_peptides)}/{len(target_data)}")

# Membership is tested against sets built once, so each look-up is a hash probe instead
# of a linear scan over the peptide lists. The lists themselves are kept for the counts
# printed above; the printed messages are unchanged.
pos_peptide_set = set(pos_peptides)
neg_peptide_set = set(neg_peptides)

for mutant_peptide in mutant_peptides:
    if mutant_peptide in pos_peptide_set:
        print(f"{mutant_peptide} is in our positive dataset")
    elif mutant_peptide in neg_peptide_set:
        print(f"{mutant_peptide} is in our negative dataset")
    else:
        print(f"{mutant_peptide} isn't in our dataset")

target/all: 5011/924412
pos/target: 2507/5011
neg/target: 2504/5011
TIQQCQSQT isn't in our dataset
TIQQCQSWT isn't in our dataset
TIQQCQSNT isn't in our dataset
TIQQCQSYT isn't in our dataset
TIQQCQSFT isn't in our dataset
TIQQMQSYT isn't in our dataset
TIQQMQSFT isn't in our dataset
TIQQVQSYT isn't in our dataset
TIQQLQSFT isn't in our dataset
TIQQLQSYT isn't in our dataset
MIQQLQSYT isn't in our dataset
IIQQVQSYT isn't in our dataset
FIQQLQSYT isn't in our dataset
LIQQLQSYT isn't in our dataset
IIQQLQSYT isn't in our dataset
FIAQLQSYT isn't in our dataset
LISQLQSYT isn't in our dataset
IIAQLQSYT isn't in our dataset
IISQVQSYT isn't in our dataset
IISQLQSYT isn't in our dataset
IISQLQSYL isn't in our dataset
IISQLQSYI isn't in our dataset
LISQLQSYL isn't in our dataset
IIAQLQSYL isn't in our dataset
FIAQLQSYL isn't in our dataset


In [26]:
import numpy as np
import torch
import torch.nn as nn
from TransPHLA_model import Transformer

# Look-up tables: HLA allele name -> its "short" sequence column, and character -> token id.
hla_seq_dict = pd.read_csv(
    data_path + "HLA_sequence_dict_ABCEG.csv", index_col=0
).set_index(["HLA_name"])["short"].to_dict()
HLA_seq = hla_seq_dict[given_HLA]
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load a trusted vocab file.
vocab = np.load(data_path + "vocab_dict.npy", allow_pickle=True).item()

model_path = "/data/lujd/neoag_model/main_task/"
model_name = "TransPHLA/TransPHLA_official_model.pkl"
model_eval = Transformer()
# map_location="cpu": this cell feeds CPU tensors to the model, so the checkpoint must be
# loadable even when it was saved from a GPU and no CUDA device is present.
model_eval.load_state_dict(
    torch.load(model_path + model_name, map_location="cpu"), strict=True
)
model_eval.eval()

hla_max_len = 34
pep_max_len = 15
# Right-pad with "-" to the fixed length and wrap in a list to get a batch of one.
hla_token = [vocab[n] for n in HLA_seq.ljust(hla_max_len, "-")]
hla_token = torch.LongTensor([hla_token])

report_line = "{}\tTransPHLA report: given HLA {}, given peptide {} | binding porbability = {:.4f}"

# Inference only: no_grad() avoids building an autograd graph for every forward pass.
with torch.no_grad():
    # Row 1 is the initial peptide; the mutants follow as rows 2, 3, ...
    for order, peptide in enumerate([init_peptide, *mutant_peptides], start=1):
        pep_token = [vocab[n] for n in peptide.ljust(pep_max_len, "-")]
        pep_token = torch.LongTensor([pep_token])
        val_outputs, _, _, _ = model_eval(pep_token, hla_token)
        # Column 1 of the softmax over the two outputs is reported as the binding probability.
        y_prob_val = nn.Softmax(dim=1)(val_outputs)[:, 1].cpu().numpy()
        print(report_line.format(order, given_HLA, peptide, y_prob_val.item()))

1	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSPT | binding porbability = 0.0032
2	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSQT | binding porbability = 0.0000
3	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSWT | binding porbability = 0.0003
4	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSNT | binding porbability = 0.0008
5	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSYT | binding porbability = 0.0001
6	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQCQSFT | binding porbability = 0.0001
7	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQMQSYT | binding porbability = 0.0000
8	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQMQSFT | binding porbability = 0.0004
9	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQVQSYT | binding porbability = 0.0000
10	TransPHLA report: given HLA HLA-A*02:02, given peptide TIQQLQSFT | binding porbability = 0.0000
11	TransPHLA report

--------
# algorithm-1a HLA-A*02:01

In [12]:
import torch
from mutation import get_mutated_peptides

# Use the first CUDA device only when it is both requested and available; otherwise use the CPU.
use_cuda = True
if use_cuda and torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Second query: a different allele and starting peptide.
given_HLA = "HLA-A*02:01"
init_peptide = "DQTYNEMGD"

# Run mutation algorithm "1a"; the returned peptides are inspected by the cells below.
mutant_peptides = get_mutated_peptides(
    given_HLA,
    init_peptide,
    device,
    num_mutation=5,
    num_peptides=5,
    algorithm="1a",
)

HLA_seq_dict preparing
Model preparing
given HLA: HLA-A*02:01, given peptide: DQTYNEMGD | binding porbability: 0.0000
************** Run algorithm-1a **************
Iteration-1, mutant_pool size: 171
source peptide: DQTYNEMGD, mutated peptide: DQTYNFMGD | 6 E->F | binding probability: 0.0052
source peptide: DQTYNEMGD, mutated peptide: DQTYDEMGD | 5 N->D | binding probability: 0.0180
source peptide: DQTYNEMGD, mutated peptide: DQTYNEMGV | 9 D->V | binding probability: 0.0220
source peptide: DQTYNEMGD, mutated peptide: DQTYNEMGM | 9 D->M | binding probability: 0.0407
source peptide: DQTYNEMGD, mutated peptide: DQTYNPMGD | 6 E->P | binding probability: 0.1719
Iteration-2, mutant_pool size: 752
source peptide: DQTYDEMGD, mutated peptide: DQFYDEMGD | 3 T->F | binding probability: 0.9637
source peptide: DQTYNFMGD, mutated peptide: DQTYNFMGV | 9 D->V | binding probability: 0.9709
source peptide: DQTYNPMGD, mutated peptide: DQTYNPMGE | 9 D->E | binding probability: 0.9785
source peptide: DQTYD

In [13]:
import pandas as pd
# Read the four labelled splits (the first CSV column becomes the index), keeping only
# the columns needed for the look-up below.
data_path = "/data/lujd/neoag_data/main_task/"
train_data_raw, val_data_raw, test_data_raw, external_data_raw = [
    pd.read_csv(data_path + csv_name, index_col=0)[['peptide', 'HLA', 'label']]
    for csv_name in (
        'train_data_fold4.csv',
        'val_data_fold4.csv',
        'independent_set.csv',
        'external_set.csv',
    )
]
merged_data = pd.concat([train_data_raw, val_data_raw, test_data_raw, external_data_raw], axis=0)

# Rows of the queried allele, split into peptides labelled 1 (pos) and 0 (neg).
target_data = merged_data[merged_data.HLA==given_HLA]
pos_peptides = target_data[target_data.label==1].peptide.to_list()
neg_peptides = target_data[target_data.label==0].peptide.to_list()
print(f"target/all: {len(target_data)}/{len(merged_data)}")
print(f"pos/target: {len(pos_peptides)}/{len(target_data)}")
print(f"neg/target: {len(neg_peptides)}/{len(target_data)}")

# Membership is tested against sets built once, so each look-up is a hash probe instead
# of a linear scan over the peptide lists. The lists themselves are kept for the counts
# printed above; the printed messages are unchanged.
pos_peptide_set = set(pos_peptides)
neg_peptide_set = set(neg_peptides)

for mutant_peptide in mutant_peptides:
    if mutant_peptide in pos_peptide_set:
        print(f"{mutant_peptide} is in our positive dataset")
    elif mutant_peptide in neg_peptide_set:
        print(f"{mutant_peptide} is in our negative dataset")
    else:
        print(f"{mutant_peptide} isn't in our dataset")

target/all: 68830/924412
pos/target: 34460/68830
neg/target: 34370/68830
DQTYNFMGD isn't in our dataset
DQTYDEMGD isn't in our dataset
DQTYNEMGV isn't in our dataset
DQTYNEMGM isn't in our dataset
DQTYNPMGD isn't in our dataset
DQFYDEMGD isn't in our dataset
DQTYNFMGV isn't in our dataset
DQTYNPMGE isn't in our dataset
DQTQDEMGD isn't in our dataset
DQTHDEMGD isn't in our dataset
DQIHDEMGD isn't in our dataset
DQTHDEMGE isn't in our dataset
DQTHDEYGD isn't in our dataset
DQTHDEFGD isn't in our dataset
HQFYDEMGD isn't in our dataset
HQFHDEMGD isn't in our dataset
HQFYDELGD isn't in our dataset
DQLHDEFGD isn't in our dataset
HQFQDEMGD isn't in our dataset
HQFEDEMGD isn't in our dataset
HQFEDEIGD isn't in our dataset
HQFEDEMAD isn't in our dataset
HQFEDELGD isn't in our dataset
HQFEDEMKD isn't in our dataset
HQFEDEMQD isn't in our dataset


In [14]:
import numpy as np
import torch
import torch.nn as nn
from TransPHLA_model import Transformer

# Look-up tables: HLA allele name -> its "short" sequence column, and character -> token id.
hla_seq_dict = pd.read_csv(
    data_path + "HLA_sequence_dict_ABCEG.csv", index_col=0
).set_index(["HLA_name"])["short"].to_dict()
HLA_seq = hla_seq_dict[given_HLA]
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load a trusted vocab file.
vocab = np.load(data_path + "vocab_dict.npy", allow_pickle=True).item()

model_path = "/data/lujd/neoag_model/main_task/"
model_name = "TransPHLA/TransPHLA_official_model.pkl"
model_eval = Transformer()
# map_location="cpu": this cell feeds CPU tensors to the model, so the checkpoint must be
# loadable even when it was saved from a GPU and no CUDA device is present.
model_eval.load_state_dict(
    torch.load(model_path + model_name, map_location="cpu"), strict=True
)
model_eval.eval()

hla_max_len = 34
pep_max_len = 15
# Right-pad with "-" to the fixed length and wrap in a list to get a batch of one.
hla_token = [vocab[n] for n in HLA_seq.ljust(hla_max_len, "-")]
hla_token = torch.LongTensor([hla_token])

report_line = "{}\tTransPHLA report: given HLA {}, given peptide {} | binding porbability = {:.4f}"

# Inference only: no_grad() avoids building an autograd graph for every forward pass.
with torch.no_grad():
    # Row 1 is the initial peptide; the mutants follow as rows 2, 3, ...
    for order, peptide in enumerate([init_peptide, *mutant_peptides], start=1):
        pep_token = [vocab[n] for n in peptide.ljust(pep_max_len, "-")]
        pep_token = torch.LongTensor([pep_token])
        val_outputs, _, _, _ = model_eval(pep_token, hla_token)
        # Column 1 of the softmax over the two outputs is reported as the binding probability.
        y_prob_val = nn.Softmax(dim=1)(val_outputs)[:, 1].cpu().numpy()
        print(report_line.format(order, given_HLA, peptide, y_prob_val.item()))

1	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNEMGD | binding porbability = 0.0000
2	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNFMGD | binding porbability = 0.0000
3	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYDEMGD | binding porbability = 0.0000
4	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNEMGV | binding porbability = 0.0000
5	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNEMGM | binding porbability = 0.0000
6	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNPMGD | binding porbability = 0.0000
7	TransPHLA report: given HLA HLA-A*02:01, given peptide DQFYDEMGD | binding porbability = 0.0000
8	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNFMGV | binding porbability = 0.0061
9	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNPMGE | binding porbability = 0.0000
10	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTQDEMGD | binding porbability = 0.0000
11	TransPHLA report

# algorithm-1b HLA-A*02:01

In [4]:
import torch
from mutation import get_mutated_peptides

# Device selection: the first GPU is used only when CUDA is available and
# `use_cuda` is left switched on; otherwise everything runs on the CPU.
use_cuda = True
if torch.cuda.is_available() and use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Query for this section: the HLA allele and the starting peptide.
given_HLA, init_peptide = "HLA-A*02:01", "DQTYNEMGD"

# Run mutation algorithm "1b"; the returned peptides are reused by the cells below.
mutant_peptides = get_mutated_peptides(
    given_HLA,
    init_peptide,
    device,
    num_mutation=5,
    num_peptides=5,
    algorithm="1b",
)

HLA_seq_dict preparing
Model preparing
given HLA: HLA-A*02:01, given peptide: DQTYNEMGD | binding porbability: 0.0000
************** Run algorithm-1b **************
Iteration-1, mutant_pool size: 171
source peptide: DQTYNEMGD, mutated peptide: DQTYNPMGD | 6 E->P | binding probability: 0.1719
source peptide: DQTYNEMGD, mutated peptide: DQTYNFMGD | 6 E->F | binding probability: 0.0052
source peptide: DQTYNEMGD, mutated peptide: DQTYNYMGD | 6 E->Y | binding probability: 0.0006
source peptide: DQTYNEMGD, mutated peptide: DQTYNWMGD | 6 E->W | binding probability: 0.0005
source peptide: DQTYNEMGD, mutated peptide: DQTYNDMGD | 6 E->D | binding probability: 0.0003
Iteration-2, mutant_pool size: 760
source peptide: DQTYNPMGD, mutated peptide: DQTYNPMGE | 9 D->E | binding probability: 0.9785
source peptide: DQTYNFMGD, mutated peptide: DQTYNFMGV | 9 D->V | binding probability: 0.9709
source peptide: DQTYNFMGD, mutated peptide: DQTYNFMGM | 9 D->M | binding probability: 0.9208
source peptide: DQTYN

In [5]:
import pandas as pd
# Directory holding the peptide-HLA datasets (absolute path).
data_path = "/data/lujd/neoag_data/main_task/"

# Each split is reduced to the three columns this cell actually reads.
wanted_columns = ['peptide', 'HLA', 'label']
train_data_raw = pd.read_csv(data_path+'train_data_fold4.csv', index_col=0)[wanted_columns]
val_data_raw = pd.read_csv(data_path+'val_data_fold4.csv', index_col=0)[wanted_columns]
test_data_raw = pd.read_csv(data_path+'independent_set.csv', index_col=0)[wanted_columns]
external_data_raw = pd.read_csv(data_path+'external_set.csv', index_col=0)[wanted_columns]
merged_data = pd.concat([train_data_raw, val_data_raw, test_data_raw, external_data_raw], axis=0)

# Keep only the rows of the HLA selected in an earlier cell (`given_HLA`),
# then split them by label (1 -> positive list, 0 -> negative list).
target_data = merged_data[merged_data.HLA==given_HLA]
pos_peptides = target_data[target_data.label==1].peptide.to_list()
neg_peptides = target_data[target_data.label==0].peptide.to_list()
print(f"target/all: {len(target_data)}/{len(merged_data)}")
print(f"pos/target: {len(pos_peptides)}/{len(target_data)}")
print(f"neg/target: {len(neg_peptides)}/{len(target_data)}")

# Membership is tested against sets (hash lookup) instead of scanning the
# lists for every mutant. The lists themselves are kept untouched because the
# counts printed above are row counts, duplicates included.
pos_peptide_set = set(pos_peptides)
neg_peptide_set = set(neg_peptides)

# Report, for every mutant produced earlier, whether it already occurs in the
# data for this HLA; the positive list takes precedence over the negative one.
for mutant_peptide in mutant_peptides:
    if mutant_peptide in pos_peptide_set:
        print(f"{mutant_peptide} is in our positive dataset")
    elif mutant_peptide in neg_peptide_set:
        print(f"{mutant_peptide} is in our negative dataset")
    else:
        print(f"{mutant_peptide} isn't in our dataset")

target/all: 68830/924412
pos/target: 34460/68830
neg/target: 34370/68830
DQTYNPMGD isn't in our dataset
DQTYNFMGD isn't in our dataset
DQTYNYMGD isn't in our dataset
DQTYNWMGD isn't in our dataset
DQTYNDMGD isn't in our dataset
DQTYNPMGE isn't in our dataset
DQTYNFMGV isn't in our dataset
DQTYNFMGM isn't in our dataset
DQTYNFMGL isn't in our dataset
DQTYNFMGI isn't in our dataset
DQTYNPLGE isn't in our dataset
DQTYNFVGI isn't in our dataset
DQTYNPVGE isn't in our dataset
DQTYNFAGI isn't in our dataset
DQTYNPIGE isn't in our dataset
DQSYNPLGE isn't in our dataset
DQIYNPLGE isn't in our dataset
DQEYNPLGE isn't in our dataset
DQNYNPLGE isn't in our dataset
DQSYNPIGE isn't in our dataset
DQSVNPLGE isn't in our dataset
DQSLNPLGE isn't in our dataset
DQSKNPLGE isn't in our dataset
DQSFNPLGE isn't in our dataset
DQSINPLGE isn't in our dataset


In [6]:
import numpy as np
import torch
import torch.nn as nn
from TransPHLA_model import Transformer

# HLA name -> sequence mapping, taken from the "short" column of the table.
hla_seq_dict = pd.read_csv(
                        data_path+"HLA_sequence_dict_ABCEG.csv", index_col=0
                        ).set_index(["HLA_name"])["short"].to_dict()
HLA_seq = hla_seq_dict[given_HLA]
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only load
# a vocabulary file that comes from a trusted source.
vocab = np.load(
            data_path+"vocab_dict.npy", allow_pickle=True
            ).item()

model_path = "/data/lujd/neoag_model/main_task/"
model_name = "TransPHLA/TransPHLA_official_model.pkl"
model_eval = Transformer()
# map_location="cpu": the inputs built below are CPU tensors, and without it a
# checkpoint saved from a GPU cannot be deserialized on a CPU-only machine.
model_eval.load_state_dict(torch.load(model_path+model_name, map_location="cpu"), strict = True)
model_eval.eval()

# Sequences are right-padded with "-" to these fixed lengths before tokenizing.
hla_max_len = 34
pep_max_len = 15
hla_token = torch.LongTensor([[vocab[n] for n in HLA_seq.ljust(hla_max_len, "-")]])

# Score the initial peptide first (row 1) and then every mutant (rows 2, 3, ...)
# through one shared code path. Gradients are never needed here, so the
# forward passes run under no_grad.
with torch.no_grad():
    for order, peptide in enumerate([init_peptide, *mutant_peptides], start=1):
        pep_token = torch.LongTensor([[vocab[n] for n in peptide.ljust(pep_max_len, "-")]])
        val_outputs, _, _, _ = model_eval(pep_token, hla_token)
        # Softmax over dim 1; index 1 is the value reported as the binding probability.
        y_prob_val = torch.softmax(val_outputs, dim=1)[:, 1].cpu().numpy()
        print("{}\tTransPHLA report: given HLA {}, given peptide {} | binding porbability = {:.4f}".format(order, given_HLA, peptide, y_prob_val.item()))

1	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNEMGD | binding porbability = 0.0000
2	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNPMGD | binding porbability = 0.0000
3	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNFMGD | binding porbability = 0.0000
4	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNYMGD | binding porbability = 0.0000
5	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNWMGD | binding porbability = 0.0000
6	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNDMGD | binding porbability = 0.0000
7	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNPMGE | binding porbability = 0.0000
8	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNFMGV | binding porbability = 0.0061
9	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNFMGM | binding porbability = 0.0000
10	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNFMGL | binding porbability = 0.0014
11	TransPHLA report

# algorithm-2a HLA-A*02:01

In [15]:
import torch
from mutation import get_mutated_peptides

# Device selection: the first GPU is used only when CUDA is available and
# `use_cuda` is left switched on; otherwise everything runs on the CPU.
use_cuda = True
if torch.cuda.is_available() and use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Query for this section: the HLA allele and the starting peptide.
given_HLA, init_peptide = "HLA-A*02:01", "DQTYNEMGD"

# Run mutation algorithm "2a"; the returned peptides are reused by the cells below.
mutant_peptides = get_mutated_peptides(
    given_HLA,
    init_peptide,
    device,
    num_mutation=5,
    num_peptides=5,
    algorithm="2a",
)

HLA_seq_dict preparing
Model preparing
given HLA: HLA-A*02:01, given peptide: DQTYNEMGD | binding porbability: 0.0000
************** Run algorithm-2a **************
Iteration-1, mutant_pool size: 19
source peptide: DQTYNEMGD, mutated peptide: DQTYNEMGE | 9 D->E | binding probability: 0.0012
source peptide: DQTYNEMGD, mutated peptide: DQTYNEMGI | 9 D->I | binding probability: 0.0013
source peptide: DQTYNEMGD, mutated peptide: DQTYNEMGW | 9 D->W | binding probability: 0.0039
source peptide: DQTYNEMGD, mutated peptide: DQTYNEMGV | 9 D->V | binding probability: 0.0220
source peptide: DQTYNEMGD, mutated peptide: DQTYNEMGM | 9 D->M | binding probability: 0.0407
Iteration-2, mutant_pool size: 95
source peptide: DQTYNEMGV, mutated peptide: DQTYNMMGV | 6 E->M | binding probability: 0.8842
source peptide: DQTYNEMGM, mutated peptide: DQTYNFMGM | 6 E->F | binding probability: 0.9209
source peptide: DQTYNEMGV, mutated peptide: DQTYNLMGV | 6 E->L | binding probability: 0.9546
source peptide: DQTYNEM

In [16]:
import pandas as pd
# Directory holding the peptide-HLA datasets (absolute path).
data_path = "/data/lujd/neoag_data/main_task/"

# Each split is reduced to the three columns this cell actually reads.
wanted_columns = ['peptide', 'HLA', 'label']
train_data_raw = pd.read_csv(data_path+'train_data_fold4.csv', index_col=0)[wanted_columns]
val_data_raw = pd.read_csv(data_path+'val_data_fold4.csv', index_col=0)[wanted_columns]
test_data_raw = pd.read_csv(data_path+'independent_set.csv', index_col=0)[wanted_columns]
external_data_raw = pd.read_csv(data_path+'external_set.csv', index_col=0)[wanted_columns]
merged_data = pd.concat([train_data_raw, val_data_raw, test_data_raw, external_data_raw], axis=0)

# Keep only the rows of the HLA selected in an earlier cell (`given_HLA`),
# then split them by label (1 -> positive list, 0 -> negative list).
target_data = merged_data[merged_data.HLA==given_HLA]
pos_peptides = target_data[target_data.label==1].peptide.to_list()
neg_peptides = target_data[target_data.label==0].peptide.to_list()
print(f"target/all: {len(target_data)}/{len(merged_data)}")
print(f"pos/target: {len(pos_peptides)}/{len(target_data)}")
print(f"neg/target: {len(neg_peptides)}/{len(target_data)}")

# Membership is tested against sets (hash lookup) instead of scanning the
# lists for every mutant. The lists themselves are kept untouched because the
# counts printed above are row counts, duplicates included.
pos_peptide_set = set(pos_peptides)
neg_peptide_set = set(neg_peptides)

# Report, for every mutant produced earlier, whether it already occurs in the
# data for this HLA; the positive list takes precedence over the negative one.
for mutant_peptide in mutant_peptides:
    if mutant_peptide in pos_peptide_set:
        print(f"{mutant_peptide} is in our positive dataset")
    elif mutant_peptide in neg_peptide_set:
        print(f"{mutant_peptide} is in our negative dataset")
    else:
        print(f"{mutant_peptide} isn't in our dataset")

target/all: 68830/924412
pos/target: 34460/68830
neg/target: 34370/68830
DQTYNEMGE isn't in our dataset
DQTYNEMGI isn't in our dataset
DQTYNEMGW isn't in our dataset
DQTYNEMGV isn't in our dataset
DQTYNEMGM isn't in our dataset
DQTYNMMGV isn't in our dataset
DQTYNFMGM isn't in our dataset
DQTYNLMGV isn't in our dataset
DQTKNEMGE isn't in our dataset
DQTYNFMGV isn't in our dataset
FQTYNLMGV isn't in our dataset
HQTYNFMGV isn't in our dataset
LQTYNLMGV isn't in our dataset
HQTYNLMGV isn't in our dataset
DQTKNELGE isn't in our dataset
FQTENLMGV isn't in our dataset
LQTDNLMGV isn't in our dataset
HQTYNFMEV isn't in our dataset
HQTYNFMTV isn't in our dataset
FQTPNLMGV isn't in our dataset
FQFPNLMGV isn't in our dataset
FQTENLVGV isn't in our dataset
FQNPNLMGV isn't in our dataset
FQDPNLMGV isn't in our dataset
HQDYNFMTV isn't in our dataset


In [17]:
import numpy as np
import torch
import torch.nn as nn
from TransPHLA_model import Transformer

# HLA name -> sequence mapping, taken from the "short" column of the table.
hla_seq_dict = pd.read_csv(
                        data_path+"HLA_sequence_dict_ABCEG.csv", index_col=0
                        ).set_index(["HLA_name"])["short"].to_dict()
HLA_seq = hla_seq_dict[given_HLA]
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only load
# a vocabulary file that comes from a trusted source.
vocab = np.load(
            data_path+"vocab_dict.npy", allow_pickle=True
            ).item()

model_path = "/data/lujd/neoag_model/main_task/"
model_name = "TransPHLA/TransPHLA_official_model.pkl"
model_eval = Transformer()
# map_location="cpu": the inputs built below are CPU tensors, and without it a
# checkpoint saved from a GPU cannot be deserialized on a CPU-only machine.
model_eval.load_state_dict(torch.load(model_path+model_name, map_location="cpu"), strict = True)
model_eval.eval()

# Sequences are right-padded with "-" to these fixed lengths before tokenizing.
hla_max_len = 34
pep_max_len = 15
hla_token = torch.LongTensor([[vocab[n] for n in HLA_seq.ljust(hla_max_len, "-")]])

# Score the initial peptide first (row 1) and then every mutant (rows 2, 3, ...)
# through one shared code path. Gradients are never needed here, so the
# forward passes run under no_grad.
with torch.no_grad():
    for order, peptide in enumerate([init_peptide, *mutant_peptides], start=1):
        pep_token = torch.LongTensor([[vocab[n] for n in peptide.ljust(pep_max_len, "-")]])
        val_outputs, _, _, _ = model_eval(pep_token, hla_token)
        # Softmax over dim 1; index 1 is the value reported as the binding probability.
        y_prob_val = torch.softmax(val_outputs, dim=1)[:, 1].cpu().numpy()
        print("{}\tTransPHLA report: given HLA {}, given peptide {} | binding porbability = {:.4f}".format(order, given_HLA, peptide, y_prob_val.item()))

1	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNEMGD | binding porbability = 0.0000
2	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNEMGE | binding porbability = 0.0000
3	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNEMGI | binding porbability = 0.0002
4	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNEMGW | binding porbability = 0.0000
5	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNEMGV | binding porbability = 0.0000
6	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNEMGM | binding porbability = 0.0000
7	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNMMGV | binding porbability = 0.0011
8	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNFMGM | binding porbability = 0.0000
9	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNLMGV | binding porbability = 0.0014
10	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTKNEMGE | binding porbability = 0.0000
11	TransPHLA report

# algorithm-2b HLA-A*02:01

In [18]:
import torch
from mutation import get_mutated_peptides

# Device selection: the first GPU is used only when CUDA is available and
# `use_cuda` is left switched on; otherwise everything runs on the CPU.
use_cuda = True
if torch.cuda.is_available() and use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Query for this section: the HLA allele and the starting peptide.
given_HLA, init_peptide = "HLA-A*02:01", "DQTYNEMGD"

# Run mutation algorithm "2b"; the returned peptides are reused by the cells below.
mutant_peptides = get_mutated_peptides(
    given_HLA,
    init_peptide,
    device,
    num_mutation=5,
    num_peptides=5,
    algorithm="2b",
)

HLA_seq_dict preparing
Model preparing
given HLA: HLA-A*02:01, given peptide: DQTYNEMGD | binding porbability: 0.0000
************** Run algorithm-2b **************
Iteration-1, mutant_pool size: 19
source peptide: DQTYNEMGD, mutated peptide: DQTYNEMGE | 9 D->E | binding probability: 0.0012
source peptide: DQTYNEMGD, mutated peptide: DQTYNEMGI | 9 D->I | binding probability: 0.0013
source peptide: DQTYNEMGD, mutated peptide: DQTYNEMGW | 9 D->W | binding probability: 0.0039
source peptide: DQTYNEMGD, mutated peptide: DQTYNEMGV | 9 D->V | binding probability: 0.0220
source peptide: DQTYNEMGD, mutated peptide: DQTYNEMGM | 9 D->M | binding probability: 0.0407
Iteration-2, mutant_pool size: 95
source peptide: DQTYNEMGV, mutated peptide: DQTYNMMGV | 6 E->M | binding probability: 0.8842
source peptide: DQTYNEMGM, mutated peptide: DQTYNFMGM | 6 E->F | binding probability: 0.9209
source peptide: DQTYNEMGV, mutated peptide: DQTYNLMGV | 6 E->L | binding probability: 0.9546
source peptide: DQTYNEM

In [19]:
import pandas as pd
# Directory holding the peptide-HLA datasets (absolute path).
data_path = "/data/lujd/neoag_data/main_task/"

# Each split is reduced to the three columns this cell actually reads.
wanted_columns = ['peptide', 'HLA', 'label']
train_data_raw = pd.read_csv(data_path+'train_data_fold4.csv', index_col=0)[wanted_columns]
val_data_raw = pd.read_csv(data_path+'val_data_fold4.csv', index_col=0)[wanted_columns]
test_data_raw = pd.read_csv(data_path+'independent_set.csv', index_col=0)[wanted_columns]
external_data_raw = pd.read_csv(data_path+'external_set.csv', index_col=0)[wanted_columns]
merged_data = pd.concat([train_data_raw, val_data_raw, test_data_raw, external_data_raw], axis=0)

# Keep only the rows of the HLA selected in an earlier cell (`given_HLA`),
# then split them by label (1 -> positive list, 0 -> negative list).
target_data = merged_data[merged_data.HLA==given_HLA]
pos_peptides = target_data[target_data.label==1].peptide.to_list()
neg_peptides = target_data[target_data.label==0].peptide.to_list()
print(f"target/all: {len(target_data)}/{len(merged_data)}")
print(f"pos/target: {len(pos_peptides)}/{len(target_data)}")
print(f"neg/target: {len(neg_peptides)}/{len(target_data)}")

# Membership is tested against sets (hash lookup) instead of scanning the
# lists for every mutant. The lists themselves are kept untouched because the
# counts printed above are row counts, duplicates included.
pos_peptide_set = set(pos_peptides)
neg_peptide_set = set(neg_peptides)

# Report, for every mutant produced earlier, whether it already occurs in the
# data for this HLA; the positive list takes precedence over the negative one.
for mutant_peptide in mutant_peptides:
    if mutant_peptide in pos_peptide_set:
        print(f"{mutant_peptide} is in our positive dataset")
    elif mutant_peptide in neg_peptide_set:
        print(f"{mutant_peptide} is in our negative dataset")
    else:
        print(f"{mutant_peptide} isn't in our dataset")

target/all: 68830/924412
pos/target: 34460/68830
neg/target: 34370/68830
DQTYNEMGE isn't in our dataset
DQTYNEMGI isn't in our dataset
DQTYNEMGW isn't in our dataset
DQTYNEMGV isn't in our dataset
DQTYNEMGM isn't in our dataset
DQTYNMMGV isn't in our dataset
DQTYNFMGM isn't in our dataset
DQTYNLMGV isn't in our dataset
DQTYNFMGV isn't in our dataset
DQTYNPMGE isn't in our dataset
MQTYNLMGV isn't in our dataset
FQTYNLMGV isn't in our dataset
HQTYNFMGV isn't in our dataset
LQTYNLMGV isn't in our dataset
HQTYNLMGV isn't in our dataset
HQPYNLMGV isn't in our dataset
MQPYNLMGV isn't in our dataset
FQPYNLMGV isn't in our dataset
HQDYNLMGV isn't in our dataset
HQDYNFMGV isn't in our dataset
HQPYNLAGV isn't in our dataset
HQDYNLAGV isn't in our dataset
HQDYNLIGV isn't in our dataset
HQDYNLVGV isn't in our dataset
HQDYNFVGV isn't in our dataset


In [20]:
import numpy as np
import torch
import torch.nn as nn
from TransPHLA_model import Transformer

# HLA name -> sequence mapping, taken from the "short" column of the table.
hla_seq_dict = pd.read_csv(
                        data_path+"HLA_sequence_dict_ABCEG.csv", index_col=0
                        ).set_index(["HLA_name"])["short"].to_dict()
HLA_seq = hla_seq_dict[given_HLA]
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only load
# a vocabulary file that comes from a trusted source.
vocab = np.load(
            data_path+"vocab_dict.npy", allow_pickle=True
            ).item()

model_path = "/data/lujd/neoag_model/main_task/"
model_name = "TransPHLA/TransPHLA_official_model.pkl"
model_eval = Transformer()
# map_location="cpu": the inputs built below are CPU tensors, and without it a
# checkpoint saved from a GPU cannot be deserialized on a CPU-only machine.
model_eval.load_state_dict(torch.load(model_path+model_name, map_location="cpu"), strict = True)
model_eval.eval()

# Sequences are right-padded with "-" to these fixed lengths before tokenizing.
hla_max_len = 34
pep_max_len = 15
hla_token = torch.LongTensor([[vocab[n] for n in HLA_seq.ljust(hla_max_len, "-")]])

# Score the initial peptide first (row 1) and then every mutant (rows 2, 3, ...)
# through one shared code path. Gradients are never needed here, so the
# forward passes run under no_grad.
with torch.no_grad():
    for order, peptide in enumerate([init_peptide, *mutant_peptides], start=1):
        pep_token = torch.LongTensor([[vocab[n] for n in peptide.ljust(pep_max_len, "-")]])
        val_outputs, _, _, _ = model_eval(pep_token, hla_token)
        # Softmax over dim 1; index 1 is the value reported as the binding probability.
        y_prob_val = torch.softmax(val_outputs, dim=1)[:, 1].cpu().numpy()
        print("{}\tTransPHLA report: given HLA {}, given peptide {} | binding porbability = {:.4f}".format(order, given_HLA, peptide, y_prob_val.item()))

1	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNEMGD | binding porbability = 0.0000
2	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNEMGE | binding porbability = 0.0000
3	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNEMGI | binding porbability = 0.0002
4	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNEMGW | binding porbability = 0.0000
5	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNEMGV | binding porbability = 0.0000
6	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNEMGM | binding porbability = 0.0000
7	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNMMGV | binding porbability = 0.0011
8	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNFMGM | binding porbability = 0.0000
9	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNLMGV | binding porbability = 0.0014
10	TransPHLA report: given HLA HLA-A*02:01, given peptide DQTYNFMGV | binding porbability = 0.0061
11	TransPHLA report