# Prediction of HLA-I peptide presentation

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
# Input/output locations used throughout the notebook.
# NOTE(review): these are absolute paths on a mapped Windows drive (X:), so the
# notebook only runs as-is on a machine where that drive is mounted.
# File of peptide sequences read into pep_df in the next code cell.
seq_file = r'X:\Feng\speclib_for_people\MariaW\IEDB_HLA_seqs\IEDB_HLA1_seqs.tsv'
# Destination passed to model.save() after training.
save_as = r'X:\Feng\speclib_for_people\MariaW\HLA1_models\HLA1_IEDB.pt'
# FASTA file handed to the classifier via fasta_files=[fasta].
fasta = r'X:\Feng\speclib_for_people\MariaW\IEDB_HLA_seqs\UP000005640_human_reviewed.fasta'
# None means the `if pretrained_model:` check below skips model.load(), so no
# existing weights are loaded before training.
pretrained_model = None

### transfer learning
# from peptdeep_hla.HLA_class_I import pretrained_HLA1
# pretrained_model = pretrained_HLA1

For transfer learning of sample-specific models, set `pretrained_model` to an existing model file and replace `seq_file` with a file of sample-specific sequences.

In [3]:
import pandas as pd
# NOTE(review): seq_file has a .tsv extension but is read with read_csv's default
# comma separator. That parses as intended only if the file has a single column
# (or is actually comma-delimited) — confirm, or pass sep explicitly.
pep_df = pd.read_csv(seq_file)
# Peptide length in characters.
# NOTE(review): the recorded output shows 'VYNMVVKL' with nAA=9, which may point
# to stray whitespace in the file or an export artifact — verify.
pep_df['nAA'] = pep_df.sequence.str.len()
# Every sequence loaded from seq_file is labelled as a positive example (HLA=1);
# concat_neg_df later adds negatives labelled HLA=0.
pep_df['HLA'] = 1
pep_df

Unnamed: 0,sequence,nAA,HLA
0,AAAAAAAAAAAAA,13,1
1,AAAAPYAGW,9,1
2,AAAARAAAL,9,1
3,AAAATCALV,9,1
4,AAAKAAAAV,9,1
...,...,...,...
705738,VVVSIDRFLR,10,1
705739,VYNMVVKL,9,1
705740,WGKSKEWGRNCKGCN,15,1
705741,YASLRSLV,8,1


In [4]:
def split_to_two_dfs(df, ratio=0.8, random_state=None):
    """Randomly split a DataFrame into two disjoint parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split. The input frame is not modified.
    ratio : float, default 0.8
        Fraction of rows placed in the first (training) frame.
    random_state : int, numpy.random.Generator or None, default None
        Forwarded to ``DataFrame.sample``. Pass a fixed value to get the same
        split on every run; ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``, each with a fresh 0..n-1 index. Together they
        contain every row of ``df`` exactly once.
    """
    # Dropping the sampled rows is done by index label. Give the frame a unique
    # positional index first, otherwise an input with repeated index labels
    # would lose extra rows from the test part.
    df = df.reset_index(drop=True)
    train_df = df.sample(frac=ratio, replace=False, random_state=random_state)
    test_df = df.drop(train_df.index)
    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

# 80/20 split of the positive peptides into a training set and a held-out test set.
# No seed is passed, so the split differs between runs.
pos_train_df, pos_test_df = split_to_two_dfs(pep_df, 0.8)

In [5]:
from peptdeep_hla.utils import get_random_sequences

def concat_neg_df(pos_df, prot_df):
    """Append length-matched negative peptides to a frame of positives.

    For every peptide length found in ``pos_df['nAA']``, as many sequences as
    there are positives of that length are requested from
    ``get_random_sequences`` and labelled ``HLA=0``.

    Parameters
    ----------
    pos_df : pandas.DataFrame
        Positive peptides; must have an ``nAA`` column.
    prot_df : pandas.DataFrame
        Passed through to ``get_random_sequences`` as its sequence source
        (presumably the protein table the random peptides are drawn from —
        confirm in peptdeep_hla.utils).

    Returns
    -------
    pandas.DataFrame
        ``pos_df`` followed by the negative frames in ascending ``nAA`` group
        order, with a fresh 0..n-1 index.
    """
    def _negatives_for_length(pep_len, n_wanted):
        # One frame of negatives for a single peptide length.
        seqs = get_random_sequences(prot_df, n=n_wanted, pep_len=pep_len)
        return pd.DataFrame(dict(sequence=seqs, nAA=pep_len, HLA=0))

    frames = [pos_df]
    frames.extend(
        _negatives_for_length(pep_len, len(same_len_df))
        for pep_len, same_len_df in pos_df.groupby('nAA')
    )
    combined = pd.concat(frames)
    return combined.reset_index(drop=True)

In [6]:
from peptdeep_hla.HLA_class_I import HLA_Class_I_Classifier

## Build, train and predict

In [7]:
# Build the classifier, handing it the FASTA file configured at the top.
model = HLA_Class_I_Classifier(
    fasta_files=[fasta]
)
# Transfer learning: load existing weights only when a pretrained model was
# configured; with pretrained_model = None this branch is skipped.
if pretrained_model:
    model.load(pretrained_model)
# Shown as the cell output (1669697 in the recorded run); presumably the number
# of model parameters — confirm in peptdeep_hla.
model.get_parameter_num()

1669697

In [8]:
# Train on the positive training peptides only.
# NOTE(review): no negatives are passed here; presumably the classifier samples
# them itself from its protein sequences — confirm in peptdeep_hla.
# In the recorded log the learning rate rises by 5e-06 per epoch, which is
# consistent with a warm-up phase over the first `warmup_epoch` epochs.
model.train(pos_train_df, epoch=100, warmup_epoch=20, verbose=True)

[Training] Epoch=1, lr=5e-06, loss=0.6934202649683322
[Training] Epoch=2, lr=1e-05, loss=0.6809309575917586
[Training] Epoch=3, lr=1.5e-05, loss=0.5204161433116445
[Training] Epoch=4, lr=2e-05, loss=0.47359454867974765
[Training] Epoch=5, lr=2.5e-05, loss=0.4567883534813827
[Training] Epoch=6, lr=3e-05, loss=0.44786587419374935
[Training] Epoch=7, lr=3.5e-05, loss=0.44158721043253846
[Training] Epoch=8, lr=4e-05, loss=0.43846351127579525
[Training] Epoch=9, lr=4.5e-05, loss=0.4339803845252631
[Training] Epoch=10, lr=5e-05, loss=0.43052031791435097
[Training] Epoch=11, lr=5.500000000000001e-05, loss=0.4305336721100897
[Training] Epoch=12, lr=6e-05, loss=0.4276179648795218
[Training] Epoch=13, lr=6.500000000000001e-05, loss=0.42716886929745945
[Training] Epoch=14, lr=7e-05, loss=0.4234656994072896
[Training] Epoch=15, lr=7.500000000000001e-05, loss=0.4201591965724837
[Training] Epoch=16, lr=8e-05, loss=0.419028694618423
[Training] Epoch=17, lr=8.5e-05, loss=0.41433650080887774
[Training]

In [9]:
# Save the trained model to the save_as path configured at the top of the notebook.
model.save(save_as)

In [10]:
# Evaluation set: the held-out positives plus, per peptide length, the same
# number of negatives (HLA=0) requested by concat_neg_df.
test_df = concat_neg_df(pos_test_df, model.protein_df)
# The metrics cell reads test_df.HLA_prob_pred without reassigning test_df, so
# predict() is relied on to add that column to test_df in place — TODO confirm.
model.predict(test_df)

Unnamed: 0,sequence,nAA,HLA,HLA_prob_pred
0,AAAAAAAAAAAAA,13,1,0.706319
1,AADKAAAAY,9,1,0.971726
2,AADLTQIFEV,10,1,0.716983
3,AAEPAALAY,9,1,0.948992
4,AAKAKAAL,8,1,0.955846
...,...,...,...,...
282293,YAMENHSLREENRRLRLLEPVKRAQEMDAQTIAKLEKAFSEISGM,45,0,0.940797
282294,LVPSPSLPRGCWQPPGSKSRPHRQGAQGHRAQVTQPSPKEPDRIK,45,0,0.828227
282295,YRVYPEGTLELRRVTAEEAGLYTCVAQNLVGADTKTVSVVVGRALL,46,0,0.981596
282296,LSLGQQLLRATADEDLQTAILLLAHGSREEVNETCGEGDGCTALHLA,47,0,0.650915


In [11]:
# Decision threshold on the predicted probability; also reused by the
# proteome-wide prediction cell further down.
prob=0.7

# Use the actual class counts as denominators instead of len(test_df)/2, so the
# metrics stay correct even if positives and negatives are not exactly balanced.
# One shared "predicted positive" mask keeps all three metrics consistent for
# scores exactly equal to the threshold.
is_positive = test_df['HLA'] == 1
predicted_positive = test_df['HLA_prob_pred'] > prob
true_positives = (predicted_positive & is_positive).sum()
false_positives = (predicted_positive & ~is_positive).sum()

# Precision: fraction of predicted positives that are true positives.
print("Precision =", test_df.loc[predicted_positive, 'HLA'].mean())
# Recall: fraction of true positives that are predicted positive.
print("Recall =", true_positives / is_positive.sum())
# False positive rate: fraction of negatives that are predicted positive.
print("False Positive Rate =", false_positives / (~is_positive).sum())

Precision = 0.8810354403137908
Recall = 0.7192895450906489
False Positive Rate = 0.09712431543971267


## Get all sequences with predicted probability > 0.7

In [14]:
# Run prediction over the model's proteins, passing `prob` (set in the metrics
# cell) as the probability threshold; presumably only peptides scoring above
# it are kept — confirm in peptdeep_hla.
# The recorded run took about 15 minutes (72 progress-bar iterations).
hla_df = model.predict_from_proteins(prob_threshold=prob)
hla_df

  lcp_array = kasai(cat_prot, suffix_array)
100%|██████████| 72/72 [15:39<00:00, 13.05s/it]


Unnamed: 0,start_pos,end_pos,nAA,HLA_prob_pred,sequence
2,4709940,4709948,8,0.747321,ASVDYIRK
3,4709941,4709949,8,0.928773,SVDYIRKL
5,4709943,4709951,8,0.797296,DYIRKLQR
13,4709951,4709959,8,0.824264,EQQRAKEL
20,4709958,4709966,8,0.700991,LENRQKKL
...,...,...,...,...,...
73207276,1232474,1232488,14,0.850721,TLLGAQPEDEAEYY
73207319,2563508,2563522,14,0.732092,VVGSRLDTPLGQTL
73207326,2563511,2563525,14,0.930226,SRLDTPLGQTLIRM
73207335,4323141,4323155,14,0.790535,MGMMNNPNPYGSPY


In [13]:
# hla_df.to_csv('Predicted_RA957_no_pretrain_HLA.tsv',index=False, sep="\t")