# HLA peptide prediction

Build a deep-learning (DL) binary classifier that distinguishes presented HLA peptides from non-HLA peptides.

1. Use experimental HLA peptides to train a DL model to distinguish HLA peptides from non-HLA peptides.
2. Apply the model to classify all peptides from a fasta file with non-specific digestion.
3. Keep peptides with high predicted probabilities as HLA peptides.

### Read experimental HLA peptides

In [None]:
import pandas as pd

# Destination of the trained model weights (used by model.save further down).
save_as = '/User/Feng/HLA/model/pan_HLA_model.pth'

# Experimental HLA peptides, one peptide per row in the `sequence` column.
pep_df = pd.read_table('/User/Feng/HLA/datasets/all_HLA_seqs.txt')
pep_df['nAA'] = pep_df['sequence'].str.len()
pep_df['HLA'] = 1  # every row of this table is a positive example
# Keep peptides of 8 to 14 residues (both bounds inclusive).
pep_df = pep_df[pep_df['nAA'].between(8, 14)]
pep_df

Unnamed: 0,sequence,nAA,HLA
32,KQYDYDSSTIRKK,13,1
54,HVLEEVESLNRKY,13,1
74,KVEEAEPEEFVVEK,14,1
76,KTLPAMLGTGKL,12,1
86,GVEEEEEDGEMRE,13,1
...,...,...,...
167737,KLKKQLKIY,9,1
167738,KLIDPQTQV,9,1
167741,MATSAPLRSL,10,1
167742,GTIDEIQK,8,1


### Split data into training, validation, and testing sets

ratio = 7:1:2

In [None]:
def split_to_two_dfs(df, ratio=0.7, random_state=None):
    """Randomly split ``df`` into two disjoint DataFrames.

    Args:
        df: DataFrame to split. It is not modified.
        ratio: Fraction of the rows (between 0 and 1) placed in the first
            returned frame; the remaining rows go to the second one.
        random_state: Optional seed or generator forwarded to
            ``DataFrame.sample`` so the split can be reproduced. ``None``
            (the default) keeps the unseeded behaviour.

    Returns:
        Tuple ``(first_df, second_df)``; both carry a fresh 0..n-1 index.
    """
    # Work on a unique positional index: the complement is obtained by
    # dropping labels, which would discard extra rows whenever the incoming
    # index contains duplicate labels.
    df = df.reset_index(drop=True)
    train_df = df.sample(frac=ratio, replace=False, random_state=random_state)
    test_df = df.drop(train_df.index)
    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

# 70 % of the positives are used for training. The remaining 30 % is split
# again: about a third of it (~10 % overall) becomes the validation set and
# the rest (~20 % overall) the test set.
pos_train_df, pos_test_df = split_to_two_dfs(df=pep_df, ratio=0.7)
pos_valid_df, pos_test_df = split_to_two_dfs(df=pos_test_df, ratio=0.33)

### Read fasta file

In [None]:
from alphabase.protein.fasta import load_all_proteins

# Load the proteins of the reviewed human UniProt FASTA file; the result is
# converted into a DataFrame (one row per entry) in the next cell.
protein_dict = load_all_proteins(['/User/Feng/fasta/uniprot_human_reviewed_20210309.fasta'])

In [None]:
# One row per entry of protein_dict; the dict keys only serve as a temporary
# index and are discarded by the reset below.
prot_df = pd.DataFrame.from_dict(protein_dict, orient='index')
prot_df['nAA'] = prot_df['sequence'].str.len()
# Keep only proteins at least as long as the longest HLA peptide, so a random
# peptide of any training length can be cut from every remaining protein.
prot_df = prot_df[prot_df['nAA'] >= pep_df['nAA'].max()].reset_index(drop=True)
prot_df

Unnamed: 0,protein_id,full_name,gene_name,description,sequence,nAA
0,Q9H9K5,sp|Q9H9K5|MER34_HUMAN,ERVMER34-1,sp|Q9H9K5|MER34_HUMAN Endogenous retroviral en...,MGSLSNYALLQLTLTAFLTILVQPQHLLAPVFRTLSILTNQSNCWL...,563
1,P04439,sp|P04439|HLAA_HUMAN,HLA-A,sp|P04439|HLAA_HUMAN HLA class I histocompatib...,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,365
2,P01911,sp|P01911|DRB1_HUMAN,HLA-DRB1,sp|P01911|DRB1_HUMAN HLA class II histocompati...,MVCLKLPGGSCMTALTVTLMVLSSPLALSGDTRPRFLWQPKRECHF...,266
3,P01889,sp|P01889|HLAB_HUMAN,HLA-B,sp|P01889|HLAB_HUMAN HLA class I histocompatib...,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...,362
4,P31689,sp|P31689|DNJA1_HUMAN,DNAJA1,sp|P31689|DNJA1_HUMAN DnaJ homolog subfamily A...,MVKETTYYDVLGVKPNATQEELKKAYRKLALKYHPDKNPNEGEKFK...,397
...,...,...,...,...,...,...
20381,Q8WVZ7,sp|Q8WVZ7|RN133_HUMAN,RNF133,sp|Q8WVZ7|RN133_HUMAN E3 ubiquitin-protein lig...,MHLLKVGTWRNNTASSWLMKFSVLWLVSQNCCRASVVWMAYMNISF...,376
20382,P05387,sp|P05387|RLA2_HUMAN,RPLP2,sp|P05387|RLA2_HUMAN 60S acidic ribosomal prot...,MRYVASYLLAALGGNSSPSAKDIKKILDSVGIEADDDRLNKVISEL...,115
20383,P51991,sp|P51991|ROA3_HUMAN,HNRNPA3,sp|P51991|ROA3_HUMAN Heterogeneous nuclear rib...,MEVKPPPGRPQPDSGRRRRRRGEEGHDPKEPEQLRKLFIGGLSFET...,378
20384,Q9BZX4,sp|Q9BZX4|ROP1B_HUMAN,ROPN1B,sp|Q9BZX4|ROP1B_HUMAN Ropporin-1B OS=Homo sapi...,MAQTDKPTCIPPELPKMLKEFAKAAIRAQPQDLIQWGADYFEALSR...,212


### Generate non-HLA peptides

Generate random peptides from protein sequences as non-HLA peptides when training

In [None]:
import numpy as np

def get_subseq(x, pep_len):
    """Cut one random peptide of length ``pep_len`` out of a protein.

    Args:
        x: Pair ``(sequence, prot_len)``, e.g. one row of
            ``prot_df[['sequence', 'nAA']]``.
        pep_len: Number of residues in the returned sub-sequence.

    Returns:
        A substring of ``sequence`` with ``pep_len`` characters whose start
        is drawn uniformly from all valid start positions.

    Raises:
        ValueError: If ``pep_len`` is larger than ``prot_len``.
    """
    sequence, prot_len = x
    # Valid starts are 0 .. prot_len - pep_len, i.e. prot_len - pep_len + 1
    # positions in total.
    n_starts = prot_len - pep_len + 1
    if n_starts < 1:
        raise ValueError(
            f"pep_len ({pep_len}) exceeds the protein length ({prot_len})"
        )
    # The upper bound of randint is exclusive, so passing n_starts makes the
    # last window reachable and allows prot_len == pep_len.
    start = np.random.randint(0, n_starts)
    return sequence[start:start+pep_len]

def get_random_sequences(prot_df, n, pep_len):
    """Draw ``n`` random peptides of length ``pep_len`` from ``prot_df``.

    ``n`` proteins are sampled with replacement and one random sub-sequence
    is cut from each of them with ``get_subseq``. The peptides are returned
    as a NumPy unicode array.
    """
    sampled = prot_df.sample(n, replace=True)
    peptides = sampled[['sequence', 'nAA']].apply(
        get_subseq, axis=1, pep_len=pep_len
    )
    return peptides.values.astype('U')
get_random_sequences(prot_df, n=10, pep_len=9)

array(['VLPDLLEEE', 'GDTSGYYQR', 'MSARVDAVA', 'WYSLVIRED', 'ALLLLAVAK',
       'TWNHNNMFW', 'ALAAGMQVV', 'CRGHSTKAD', 'KDAPQGCKE', 'TILALARQG'],
      dtype='<U9')

#### Concatenate HLA and non-HLA peptide DataFrames

In [None]:
def concat_neg_df(pos_df, prot_df):
    """Append random (non-HLA) peptides to a frame of HLA peptides.

    For every peptide length found in ``pos_df``, as many random protein
    sub-sequences of that length are generated as there are positives, and
    labelled ``HLA=0``. The positives come first in the result, followed by
    the negatives grouped by length; the index is renumbered from 0.
    """
    neg_dfs = [
        pd.DataFrame(dict(
            sequence=get_random_sequences(
                prot_df, n=len(group_df), pep_len=nAA
            ),
            nAA=nAA,
            HLA=0,
        ))
        for nAA, group_df in pos_df.groupby('nAA')
    ]
    return pd.concat([pos_df, *neg_dfs]).reset_index(drop=True)

valid_df = concat_neg_df(pos_df=pos_valid_df, prot_df=prot_df)
valid_df

Unnamed: 0,sequence,nAA,HLA
0,AQSYAKRI,8,1
1,EYRGEWKEGR,10,1
2,DVAEGDLIEHF,11,1
3,EVLISRVY,8,1
4,RPTGGVGAVAL,11,1
...,...,...,...
31461,SQEVCGLRWAPDGR,14,0
31462,APSTPGRPERAARP,14,0
31463,SLEDLRTKLTVIPQ,14,0
31464,LMELGLSRMSNLSV,14,0


## Design the HLA classifier

In [None]:
import torch
import pandas as pd
import numpy as np
import tqdm
import numba

import peptdeep.model.building_block as building_block
from peptdeep.model.model_interface import ModelInterface
from peptdeep.model.featurize import get_ascii_indices

from alphabase.protein.lcp_digest import get_substring_indices
from alphabase.protein.fasta import load_all_proteins

The HLA classifier based on LSTM

In [None]:
class HLA_LSTM(torch.nn.Module):
    """Binary HLA-peptide classifier network.

    Layer stack: Embedding -> SeqCNN -> dropout -> SeqLSTM ->
    SeqAttentionSum -> dropout -> Linear -> GELU -> Linear -> Sigmoid.
    """
    def __init__(self, *,
        hidden_dim=256,
        input_dim=128,
        n_lstm_layers=4,
        dropout=0.1,
    ):
        """Build the layer stack.

        Args:
            hidden_dim: Width of the LSTM and attention layers; the embedding
                and the CNN input use ``hidden_dim//4``.
            input_dim: Number of distinct token indices the embedding
                accepts.
            n_lstm_layers: Passed to ``SeqLSTM`` as ``rnn_layer``.
            dropout: Dropout probability; the same Dropout module is applied
                at two places in the stack.
        """
        super().__init__()
        self.dropout = torch.nn.Dropout(dropout)
        
        # NOTE(review): SeqCNN presumably widens hidden_dim//4 features to
        # hidden_dim, which is what SeqLSTM is given as input size — confirm
        # against peptdeep.model.building_block.
        self.nn = torch.nn.Sequential(
            torch.nn.Embedding(input_dim, hidden_dim//4),
            building_block.SeqCNN(hidden_dim//4),
            self.dropout,
            building_block.SeqLSTM(hidden_dim, hidden_dim, rnn_layer=n_lstm_layers),
            building_block.SeqAttentionSum(hidden_dim),
            self.dropout,
            torch.nn.Linear(hidden_dim,64),
            torch.nn.GELU(),
            torch.nn.Linear(64, 1),
            torch.nn.Sigmoid()
        )
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the stack on integer token indices ``x``.

        Returns the sigmoid output with its trailing size-1 dimension removed,
        i.e. values in (0, 1); presumably one probability per input sequence
        (assumes SeqAttentionSum collapses the sequence axis — TODO confirm).
        """
        return self.nn(x).squeeze(-1)

The ModelInterface that interacts with the HLA_LSTM

In [None]:
class HLA_ModelInterface(ModelInterface):
    """ModelInterface that trains and applies the HLA peptide classifier.

    Training uses a DataFrame of positive (HLA) peptides only. For every
    batch, random protein sub-sequences of the same length are generated from
    ``prot_df`` and appended as negatives. Prediction writes the predicted
    probability into the ``HLA_prob_pred`` column.
    """
    def __init__(self, 
        dropout=0.1,
        model_class:torch.nn.Module=HLA_LSTM, #model defined above
        device:str='gpu',
        **kwargs,
    ):
        """
        Build the classifier model and configure loss and output column.

        Args:
            dropout: Dropout probability forwarded to ``model_class``.
            model_class: Network class to build; defaults to ``HLA_LSTM``.
            device: Device string handed to ``ModelInterface``.
            **kwargs: Extra keyword arguments forwarded to ``self.build``.
        """
        super().__init__(device=device)
        # Give the private state defined values up front so the feature and
        # target hooks do not raise AttributeError when they run before one
        # of the _prepare_* hooks has been called.
        self.__training = False
        self.__n_neg_per_pos = 1
        self.build(
            model_class,
            dropout=dropout,
            **kwargs
        )
        # The network ends in a Sigmoid, so plain binary cross-entropy on
        # probabilities is used.
        self.loss_func = torch.nn.BCELoss()
        self.target_column_to_predict = 'HLA_prob_pred'

    def _prepare_predict_data_df(self,
        precursor_df:pd.DataFrame,
    ):
        """Switch to prediction mode and add the output column to ``precursor_df``."""
        self.__training = False
        precursor_df[self.target_column_to_predict] = 0.
        self.predict_df = precursor_df
        
    def _prepare_train_data_df(self, precursor_df, prot_df, **kwargs):
        """Switch to training mode and remember the protein table.

        ``prot_df`` is the table random negative peptides are cut from; it is
        presumably received via ``model.train(..., prot_df=...)`` (confirm
        against ModelInterface.train). ``precursor_df`` is not used here.
        """
        self.__training = True
        self.__n_neg_per_pos = 1
        self.prot_df = prot_df

    def _get_features_from_batch_df(self, 
        batch_df: pd.DataFrame,
        **kwargs,
    ):
        """Encode the batch sequences as a long tensor of token indices.

        In training mode, freshly generated random peptides are appended
        after the rows of ``batch_df``; their number is
        ``len(batch_df) * n_neg_per_pos``.
        """
        aa_indices = self._as_tensor(
            get_ascii_indices(
                batch_df['sequence'].values.astype('U')
            ), dtype=torch.long
        )
        if not self.__training:
            return aa_indices

        # Negatives take the length of the first row; this assumes every
        # peptide in a batch has the same nAA — TODO confirm that
        # ModelInterface batches by peptide length.
        rnd_seqs = get_random_sequences(
            self.prot_df, 
            n=int(len(batch_df)*self.__n_neg_per_pos),
            pep_len = batch_df.nAA.values[0]
        )
        neg_indices = self._as_tensor(
            get_ascii_indices(rnd_seqs), dtype=torch.long
        )
        # Positives first, negatives second; the target vector relies on
        # this order.
        return torch.cat([aa_indices, neg_indices], dim=0)

    def _get_targets_from_batch_df(self, 
        batch_df: pd.DataFrame,
        **kwargs
    ) -> torch.Tensor:
        """Return the labels matching ``_get_features_from_batch_df``.

        The first ``len(batch_df)`` entries are 1 (rows of ``batch_df``). In
        training mode they are followed by zeros for the generated negatives.
        The ``HLA`` column of ``batch_df`` is not read.
        """
        n_pos = len(batch_df)
        n_neg = int(n_pos*self.__n_neg_per_pos) if self.__training else 0
        x = torch.zeros(n_pos+n_neg, device=self.device)
        x[:n_pos] = 1
        return x

## Build, train and predict

In [None]:
# Create the interface around HLA_LSTM (default dropout and device) and show
# the number of model parameters.
model = HLA_ModelInterface(model_class=HLA_LSTM)
model.get_parameter_num()

1669697

In [None]:
# Train on the positive peptides only; negatives are generated per batch from
# prot_df inside HLA_ModelInterface._get_features_from_batch_df.
model.train(pos_train_df, prot_df=prot_df, epoch=100, warmup_epoch=20, verbose=True)

[Training] Epoch=1, lr=5e-06, loss=0.6932349891534874
[Training] Epoch=2, lr=1e-05, loss=0.6930731327405998
[Training] Epoch=3, lr=1.5e-05, loss=0.6907350452882903
[Training] Epoch=4, lr=2e-05, loss=0.6277477278241089
[Training] Epoch=5, lr=2.5e-05, loss=0.5420684284929719
[Training] Epoch=6, lr=3e-05, loss=0.4841006744120802
[Training] Epoch=7, lr=3.5e-05, loss=0.4482630943613393
[Training] Epoch=8, lr=4e-05, loss=0.4203483752374138
[Training] Epoch=9, lr=4.5e-05, loss=0.4024100367512022
[Training] Epoch=10, lr=5e-05, loss=0.39221225013690336
[Training] Epoch=11, lr=5.500000000000001e-05, loss=0.3849370711083923
[Training] Epoch=12, lr=6e-05, loss=0.3784518970974854
[Training] Epoch=13, lr=6.500000000000001e-05, loss=0.37486615936670986
[Training] Epoch=14, lr=7e-05, loss=0.36704409840915886
[Training] Epoch=15, lr=7.500000000000001e-05, loss=0.3703517203352281
[Training] Epoch=16, lr=8e-05, loss=0.36510397733322214
[Training] Epoch=17, lr=8.5e-05, loss=0.36357397745762554
[Training] 

In [None]:
# Write the trained model to the path chosen in the first cell (save_as).
model.save(save_as)

In [None]:
# Build the held-out test set (positives plus an equal number of random
# negatives) and score it; predictions land in the HLA_prob_pred column.
test_df = concat_neg_df(pos_test_df, prot_df)
model.predict(test_df)

Unnamed: 0,sequence,nAA,HLA,HLA_prob_pred
0,KTDAEFVCERTLKY,14,1,0.964588
1,KSAIPHPLIM,10,1,0.915043
2,KSLVIELDKELY,12,1,0.974512
3,EVKRLKVTELR,11,1,0.968476
4,AAEDDEDDDVDTKK,14,1,0.594708
...,...,...,...,...
63883,QKFHINIHILKDCP,14,0,0.001380
63884,HTACMMGVTFCANN,14,0,0.000554
63885,LEEAINVATAAMQQ,14,0,0.052008
63886,ASPSVQGAPREVVD,14,0,0.119104


In [None]:
# Decision threshold on the predicted probability; it is reused later when
# the predictions for all protein sub-sequences are filtered.
prob=0.7
# Count the classes explicitly instead of assuming that exactly half of
# test_df is positive, and use the same "> prob" rule for every metric so
# that predictions equal to prob are treated consistently.
is_called = test_df.HLA_prob_pred > prob
is_hla = test_df.HLA == 1
print("Precision =", test_df[is_called]['HLA'].mean())
print("Recall =", (is_called & is_hla).sum()/is_hla.sum())
print("False Positive Rate =", (is_called & ~is_hla).sum()/(~is_hla).sum())

Precision = 0.893620613961028
Recall = 0.8283558727773603
False Positive Rate = 0.09861006761833213


## Apply the model for all protein sequences

In [None]:
# Concatenate all protein sequences into one string, with a '$' before the
# first, between neighbours, and after the last sequence.
cat_prot = '${}$'.format('$'.join(prot_df['sequence'].values))
len(cat_prot)

11384928

In [None]:
%%time
# Start/end offsets of candidate peptides inside cat_prot.
# NOTE(review): presumably every 8- to 14-residue window that does not cross
# a '$' separator — confirm against alphabase's get_substring_indices.
pos_starts, pos_ends = get_substring_indices(cat_prot, 8, 14)
digest_df = pd.DataFrame({'start_pos': pos_starts, 'end_pos': pos_ends})
digest_df['nAA'] = digest_df['end_pos'] - digest_df['start_pos']
# Order by peptide length and renumber the rows.
digest_df = digest_df.sort_values('nAA').reset_index(drop=True)
digest_df

In [None]:
import tqdm

def get_seq(x, cat_prot=cat_prot):
    """Return the substring ``cat_prot[start:end]`` for one (start, end) pair.

    Args:
        x: Iterable whose first two items are the start and the (exclusive)
            end offset, e.g. one row of ``df[['start_pos', 'end_pos']]``.
        cat_prot: String to slice; defaults to the global ``cat_prot`` as it
            was when this function was defined.
    """
    # Unpack by position rather than x[0] / x[1]: integer keys on a row
    # labelled 'start_pos' / 'end_pos' rely on pandas' deprecated positional
    # fallback of Series.__getitem__.
    start, end, *_ = x
    return cat_prot[start:end]

def get_seq_series(df, cat_prot):
    """Look up the peptide string for every row of ``df``.

    ``df`` must have ``start_pos`` and ``end_pos`` columns; each row is
    sliced out of ``cat_prot`` with ``get_seq``. The result has one entry per
    row and keeps the index of ``df``.
    """
    positions = df[["start_pos", "end_pos"]]
    return positions.apply(get_seq, cat_prot=cat_prot, axis=1)

def predict_digest_df(model, substr_index_df, cat_prot, batch_size=1024000):
    """Predict HLA probabilities for all sub-sequences listed in a DataFrame.

    The peptide strings are materialised chunk by chunk so that only
    ``batch_size`` sequences are held in memory at a time.

    Args:
        model: Object whose ``predict(df, batch_size=...)`` adds an
            ``HLA_prob_pred`` column to the frame it is given.
        substr_index_df: DataFrame with ``start_pos``, ``end_pos`` and
            ``nAA`` columns. It is modified in place: an ``HLA_prob_pred``
            column is added (or overwritten).
        cat_prot: Concatenated protein string the offsets refer to.
        batch_size: Number of rows materialised and scored per chunk.

    Returns:
        ``substr_index_df`` itself, with ``HLA_prob_pred`` filled in.
    """
    substr_index_df['HLA_prob_pred'] = 0.0
    prob_col = substr_index_df.columns.get_loc('HLA_prob_pred')
    for i in tqdm.tqdm(range(0, len(substr_index_df), batch_size)):
        _df = substr_index_df.iloc[i:i+batch_size]
        seq_df = get_seq_series(_df, cat_prot).to_frame('sequence')
        seq_df['nAA'] = _df.nAA
        model.predict(seq_df, batch_size=4096)
        # Assign through .iloc instead of writing into Series.values: the
        # latter needs a writable view of the column, which pandas does not
        # guarantee (the array is read-only under Copy-on-Write).
        substr_index_df.iloc[i:i+batch_size, prob_col] = (
            seq_df['HLA_prob_pred'].to_numpy()
        )
    return substr_index_df

In [None]:
#test
# Smoke test on a small slice of digest_df (rows 100 to 9999) with small
# chunks before running on the full table.
slice_df = digest_df.iloc[100:10000].copy()
predict_digest_df(
    model=model,
    substr_index_df=slice_df,
    cat_prot=cat_prot,
    batch_size=1024,
)

In [None]:
# Score every candidate sub-sequence in chunks of 1,024,000 rows.
digest_df = predict_digest_df(
    model=model,
    substr_index_df=digest_df,
    cat_prot=cat_prot,
    batch_size=1024000,
)

Keep all sequences whose predicted probability is greater than `prob` (0.7)

In [None]:
# Rows scored above the threshold, copied so that adding a column does not
# touch digest_df; the peptide strings are materialised only for these rows.
hla_df = digest_df.loc[digest_df['HLA_prob_pred'] > prob].copy()
hla_df['sequence'] = get_seq_series(df=hla_df, cat_prot=cat_prot)

In [None]:
# Tab-separated export of the predicted HLA peptides, without the row index.
hla_df.to_csv(
    path_or_buf='Predicted_HLA.tsv',
    sep="\t",
    index=False,
)